Survival Analysis

Kaplan-Meier Survival curve

Survival curve against k or more periods from entering the study is a product of the k observed survival rates for each period, given by the following: \[S(k) = \prod_i^k p_i\] where the proportion surviving the \(i^{th}\) period is given as \[p_i = \frac {r_i - d_i}{r_i}\] where \(r_i\) is the number of alive at the beginning of the period and \(d_i\) is the number of deaths within the period. Let’s see the example below

library(knitr)
surv_time <- c(1,1,4,5,6,8,9,9,12,15,22,25,37,55,72)
outcome <- c(rep("Died",4), "Unknown","Died", "Survived", 
             "Died", "Died", "Unknown", "Died", "Survived", "Died", "Died", "Survived")
treatment <- c(rep(2,5), 1, rep(2,2), rep(1,2), 2, rep(1,4))
age <- c(75, 79, 85, 76, 66, 75, 72, 70, 71, 73, 66, 73, 68, 59, 61)

data <- data.frame(patient_number = 1:15, surv_time, outcome, treatment, age)
kable(data)

patient_number	surv_time	outcome	treatment	age
1	1	Died	2	75
2	1	Died	2	79
3	4	Died	2	85
4	5	Died	2	76
5	6	Unknown	2	66
6	8	Died	1	75
7	9	Survived	2	72
8	9	Died	2	70
9	12	Died	1	71
10	15	Unknown	1	73
11	22	Died	2	66
12	25	Survived	1	73
13	37	Died	1	68
14	55	Died	1	59
15	72	Survived	1	61

Survival curve for the data set (Direct calculation)

data_tr1 <- data[data$treatment == 1,]
data_tr2 <- data[data$treatment == 2,]

survtime_tr1 <- data_tr1$surv_time
survtime_tr2 <- data_tr2$surv_time

data_tr1 <- data_tr1[order(data_tr1$surv_time),]
data_tr1$number_alive <- c(7,6,NA,6, 3, 2,2)
data_tr1$deaths <- c(1, 1, NA, NA, 1, 1, NA)
data_tr1$prop_surv <- (data_tr1$number_alive - data_tr1$deaths) / data_tr1$number_alive
data_tr1$prop_surv[is.na(data_tr1$prop_surv)] <- 1
data_tr1$cum_prop_surv <- cumprod(data_tr1$prop_surv)
data_tr1

##    patient_number surv_time  outcome treatment age number_alive deaths
## 6               6         8     Died         1  75            7      1
## 9               9        12     Died         1  71            6      1
## 10             10        15  Unknown         1  73           NA     NA
## 12             12        25 Survived         1  73            6     NA
## 13             13        37     Died         1  68            3      1
## 14             14        55     Died         1  59            2      1
## 15             15        72 Survived         1  61            2     NA
##    prop_surv cum_prop_surv
## 6  0.8571429     0.8571429
## 9  0.8333333     0.7142857
## 10 1.0000000     0.7142857
## 12 1.0000000     0.7142857
## 13 0.6666667     0.4761905
## 14 0.5000000     0.2380952
## 15 1.0000000     0.2380952

data_tr2 <- data_tr2[order(data_tr2$surv_time),]
data_tr2$number_alive <- c(8,8,6,5,NA, NA, 3, 1)
data_tr2$deaths <- c(0, 2, 1, 1, NA, NA, 1, 1)
data_tr2$prop_surv <- (data_tr2$number_alive - data_tr2$deaths) / data_tr2$number_alive
data_tr2$prop_surv[is.na(data_tr2$prop_surv)] <- 1
data_tr2$cum_prop_surv <- cumprod(data_tr2$prop_surv)
data_tr2

##    patient_number surv_time  outcome treatment age number_alive deaths
## 1               1         1     Died         2  75            8      0
## 2               2         1     Died         2  79            8      2
## 3               3         4     Died         2  85            6      1
## 4               4         5     Died         2  76            5      1
## 5               5         6  Unknown         2  66           NA     NA
## 7               7         9 Survived         2  72           NA     NA
## 8               8         9     Died         2  70            3      1
## 11             11        22     Died         2  66            1      1
##    prop_surv cum_prop_surv
## 1  1.0000000     1.0000000
## 2  0.7500000     0.7500000
## 3  0.8333333     0.6250000
## 4  0.8000000     0.5000000
## 5  1.0000000     0.5000000
## 7  1.0000000     0.5000000
## 8  0.6666667     0.3333333
## 11 0.0000000     0.0000000

Comparing survival curves of two groups using the log rank test (direct calculation)

Comparison of two survival curves can be done using a statistical hypothesis test called the log rank test. The null hypothesis is that there is no difference between the population survival curves (i.e. the probability of an event occurring at any time point is the same for each population)

The test statistic is calculated as follows: \[\chi^2(log \ rank) = \frac{(O_1 - E_1)^2}{E_1} + \frac{(O_2 - E_2)^2}{E_2}\]

Where \(O_1\) and \(O_2\) are the total numbers of observed events in group 1 and 2, respectively and \(E_1\) and \(E_2\) the total numbers of expected events. \(E_1\) and \(E_2\) can be calculated by the followings:

\[E_1 = \sum_{i = 1}^k \frac{d_i}{r_i}r_{1i}\]

\[E_2 = n-E_1\]

where \(r_{1i}\) is the number alive from group 1 at the time of event i and \(n\) is the total number of events. The test statistic is compared with a \(\chi^2\) distribution with 1 degree of freedom. It is a simplified version of a statistic that is often calculated in statistical packages. (Which utilizes variance instead of expected count as shown in below but it’s out of scope of this tutorial)

data <- data[order(data$surv_time),]
data$number_alive <- c(15, NA, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, NA)
data$num_surv_g2 <- c(8, NA, 6, 5, 4, 3, 3, 2, 1, 1, 1, 0, 0, 0, NA)
data$deaths <- c(2, NA, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, NA)
data$risk_death <- data$deaths / data$number_alive
data$exp_num_g2 <- data$num_surv_g2 * data$risk_death
# data$var <- data$num_surv_g2 * (data$deaths/data$number_alive) * 
#   ((data$number_alive - data$deaths)/data$number_alive) * 
#   ((data$number_alive - data$num_surv_g2)/(data$number_alive - 1)) 
data

##    patient_number surv_time  outcome treatment age number_alive
## 1               1         1     Died         2  75           15
## 2               2         1     Died         2  79           NA
## 3               3         4     Died         2  85           13
## 4               4         5     Died         2  76           12
## 5               5         6  Unknown         2  66           11
## 6               6         8     Died         1  75           10
## 7               7         9 Survived         2  72            9
## 8               8         9     Died         2  70            8
## 9               9        12     Died         1  71            7
## 10             10        15  Unknown         1  73            6
## 11             11        22     Died         2  66            5
## 12             12        25 Survived         1  73            4
## 13             13        37     Died         1  68            3
## 14             14        55     Died         1  59            2
## 15             15        72 Survived         1  61           NA
##    num_surv_g2 deaths risk_death exp_num_g2
## 1            8      2 0.13333333  1.0666667
## 2           NA     NA         NA         NA
## 3            6      1 0.07692308  0.4615385
## 4            5      1 0.08333333  0.4166667
## 5            4      0 0.00000000  0.0000000
## 6            3      1 0.10000000  0.3000000
## 7            3      1 0.11111111  0.3333333
## 8            2      0 0.00000000  0.0000000
## 9            1      1 0.14285714  0.1428571
## 10           1      0 0.00000000  0.0000000
## 11           1      1 0.20000000  0.2000000
## 12           0      0 0.00000000  0.0000000
## 13           0      1 0.33333333  0.0000000
## 14           0      1 0.50000000  0.0000000
## 15          NA     NA         NA         NA

n <- sum(na.omit(data$deaths))
var <- sum(na.omit(data$var))

E2 <- sum(na.omit(data$exp_num_g2))
E1 <- n - E2
test.statistic <- (4 - E1)^2/E1 + (6 - E2)^2/E2
# test.statistic2 <- (6 - E2)^2/var


# p-value
pchisq(test.statistic, df = 1, lower.tail = FALSE) # simple

## [1] 0.0322622

# pchisq(test.statistic2, df = 1, lower.tail = FALSE) # survdiff version

Cox’s proportional hazards model (Cox regression)

The log rank test is used to test whether there is a difference between the survival times of different groups but it does not allow other explanatory variables to be taken into account.

Cox’s proportional hazards model is analogous to a multiple regression model and enables the difference between survival times of particular groups of patients to be tested while allowing for other factors. In this model, the response (dependent) variable is the ‘hazard’. The hazard is the probability of dying (or experiencing the event in question) given that patients have survived up to a given point in time, or the risk for death at that moment.

In Cox’s model, the hazard ratio does not depend on time. In other words, if the risk for dying at a particular point in time in one group is, say, twice that in the other group, then at any other time it will still be twice that in the other group.

The model can be written as:

\[h_i(t) = f(t, x_{1},..., x_{k})\]

\[h_i(t) = e^{(\alpha + \beta_1 x_{1}+ ...+\beta_k x_{k})}\]

\[Let \ \ \alpha = ln(h_0(t)), \ \ then\]

\[ln(h(t)) = ln(h_0(t)) + \beta_1x_{1} + ... + \beta_k x_{k}\]

\[ln(\frac{h(t)}{h_0(t)}) = \beta_1x_{1} +...+\beta_k x_{k}\]

Where \(h(t)\) is the hazard at time \(t\), \(x_1, x_2, ..., x_p\) are the explanatory variables, and \(h_0(t)\) is the baseline hazard when all the explanatory variables are zero. The coefficients \(b_1, b_2, ..., b_p\) are estimated from the data.

Because hazard measures the instantaneous risk for death, it is difficult to illustrate it from sample data. Instead, the cumulative hazard function \(H(t)\) can be examined. This can be obtained from the cumulative survival function \(S(t)\) as follows:

\[H(t) = -ln(S(t))\]

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

cox_tr1 <- data_tr1 %>% select(surv_time, cum_prop_surv) %>% mutate(cum_hazard = -log(cum_prop_surv))
cox_tr2 <- data_tr2 %>% select(surv_time, cum_prop_surv) %>% mutate(cum_hazard = -log(cum_prop_surv))
cox_tr1; cox_tr2

##   surv_time cum_prop_surv cum_hazard
## 1         8     0.8571429  0.1541507
## 2        12     0.7142857  0.3364722
## 3        15     0.7142857  0.3364722
## 4        25     0.7142857  0.3364722
## 5        37     0.4761905  0.7419373
## 6        55     0.2380952  1.4350845
## 7        72     0.2380952  1.4350845

##   surv_time cum_prop_surv cum_hazard
## 1         1     1.0000000  0.0000000
## 2         1     0.7500000  0.2876821
## 3         4     0.6250000  0.4700036
## 4         5     0.5000000  0.6931472
## 5         6     0.5000000  0.6931472
## 6         9     0.5000000  0.6931472
## 7         9     0.3333333  1.0986123
## 8        22     0.0000000        Inf

Survival curve for the data set (using package)

library(survival)
library(survminer)

## Loading required package: ggplot2

## Loading required package: ggpubr

## Loading required package: magrittr

ev <- 1*(data$outcome == "Died")
fut <- as.numeric(data$surv_time)
su <- Surv(fut, ev)

# base line
fit0 <- survfit(su~1, data = data)
ggsurvplot(fit0, data = data)

# treatment
fit <- survfit(su~treatment, data = data)
ggsurvplot(fit, data = data, pval = TRUE)

survdiff(su~treatment, data = data)

## Call:
## survdiff(formula = su ~ treatment, data = data)
## 
##             N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=1 7        4     7.08      1.34      5.68
## treatment=2 8        6     2.92      3.25      5.68
## 
##  Chisq= 5.7  on 1 degrees of freedom, p= 0.02

# age
data$age_group <- ifelse(data$age < 65, 0, 1) # arbitrary threshold age < 65
fit <- survfit(su~age_group, data = data)
ggsurvplot(fit, data = data, pval = TRUE)

survdiff(su~treatment, data = data)

## Call:
## survdiff(formula = su ~ treatment, data = data)
## 
##             N Observed Expected (O-E)^2/E (O-E)^2/V
## treatment=1 7        4     7.08      1.34      5.68
## treatment=2 8        6     2.92      3.25      5.68
## 
##  Chisq= 5.7  on 1 degrees of freedom, p= 0.02

# cox regression
fit.coxph <- coxph(su ~ treatment, data = data)
summary(fit.coxph)

## Call:
## coxph(formula = su ~ treatment, data = data)
## 
##   n= 15, number of events= 10 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)  
## treatment 1.8292    6.2290   0.8512 2.149   0.0316 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## treatment     6.229     0.1605     1.175     33.03
## 
## Concordance= 0.715  (se = 0.098 )
## Rsquare= 0.307   (max possible= 0.933 )
## Likelihood ratio test= 5.5  on 1 df,   p=0.02
## Wald test            = 4.62  on 1 df,   p=0.03
## Score (logrank) test = 5.68  on 1 df,   p=0.02

fit.coxph <- coxph(su ~ treatment + age, data = data)
summary(fit.coxph)

## Call:
## coxph(formula = su ~ treatment + age, data = data)
## 
##   n= 15, number of events= 10 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)   
## treatment 1.88484   6.58531  0.96833 1.946  0.05160 . 
## age       0.21739   1.24283  0.08429 2.579  0.00991 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## treatment     6.585     0.1519     0.987    43.936
## age           1.243     0.8046     1.054     1.466
## 
## Concordance= 0.873  (se = 0.116 )
## Rsquare= 0.617   (max possible= 0.933 )
## Likelihood ratio test= 14.41  on 2 df,   p=7e-04
## Wald test            = 9.03  on 2 df,   p=0.01
## Score (logrank) test = 12.61  on 2 df,   p=0.002

ggforest(fit.coxph, data = data)

Reference

Bewick, V., Cheek, L. and Ball, J., 2004. Statistics review 12: survival analysis. Critical care, 8(5), p.389.