########################################################### # # Set up the environment # ########################################################### #Reset Workspace rm(list = ls()) #libraries (slide 5) library(dplyr) library(tidyr) library(readxl) library(survival) library(ggplot2) library(ggfortify) library(readr) ########################################################### # # Load Data: Exampple 1, clinical data # ########################################################### #load data from cBioPortal downloaded .tsv file (slides 8-11) #use full file path if not in working dir (slide 12) in.file.sarc<-"sarc_tcga_pub_clinical_data.tsv" #use read_tsv (readr package), not read.table which will put data in a dataframe and mess up the column names data.sarc.orig<-read_tsv(in.file.sarc) #examine results to make sure the data looks how we want head(data.sarc.orig) dim(data.sarc.orig) names(data.sarc.orig) #need to edit survival status data.sarc.orig$"Overall Survival Status" #create a new variable that we will make minor changes on from original data data.sarc<-data.sarc.orig #Surv() function expects a numerical or logical for event, 0 for censored, 1 for death data.sarc$event<-as.numeric(sub(":.*","",data.sarc.orig$"Overall Survival Status")) #we will create a new variable for time too to save typing later data.sarc$time<-data.sarc.orig$`Overall Survival (Months)` #survfit() needs formula notation. If we want overall survival, we need " ~ 1" (intercept) km.sarc<-survfit(Surv(time,event)~1,data=data.sarc) #quick look at survival curve before we move on to another example autoplot(km.sarc) autoplot(km.sarc, ylim=c(0,1)) ########################################################### # # Load Data: Example 2, basic/in vivo data # ########################################################### #Basic data is rarely "R friendly" #See excel file and slides 13-14 #constants in.file.basic<-"survival data.xlsx" in.sheet.basic<-"For R" #we know that batch 1 ended on day 14 and batch 2 ended on day 16 last.day<-c(Run1=14,Run2=16) #read the worksheet into a tibble data.basic.oirg<-read_xlsx(in.file.basic,in.sheet.basic) #examine results to make sure it looks how we want head(data.basic.oirg) dim(data.basic.oirg) names(data.basic.oirg) #pivot the data longer to more easily fine the last entry for each subject data.basic.long<-pivot_longer(data.basic.oirg,cols = starts_with("Day"),names_prefix = "Day",names_to = "time", values_to = "tumorVolume") #examine the long data data.basic.long #Set time to numeric data type data.basic.long$time<-as.numeric(data.basic.long$time) data.basic.long #remove blanks data.basic.long.noNA<-data.basic.long[!is.na(data.basic.long$tumorVolume),] data.basic.long.noNA #keep only the last time for each subject as this implicitly encodes death/censoring data.basic.long.lastTime<- data.basic.long.noNA %>% group_by(Subject, Group) %>% filter(time == max(time)) %>% arrange(Subject,Group,time) data.basic.long.lastTime #add event codes #Surv() function wants TRUE/FALSE, 1/0, or 2/1 (event/censored) #we know that batch a ended on day 14 and batch b ended on day 16 data.basic.long.lastTime$event<-ifelse(data.basic.long.lastTime$Batch=="Run1", ifelse(data.basic.long.lastTime$time==last.day["Run1"],0,1), ifelse(data.basic.long.lastTime$time==last.day["Run2"],0,1)) #did it work? print(data.basic.long.lastTime,n=48) #create and examine the Surv object data.basic.surv<-Surv(time=data.basic.long.lastTime$time,event=data.basic.long.lastTime$event) data.basic.surv # Kaplan-Meier plots and log rank test results ############################################################################### #overall KM curve (slide 16) km.basic <- survfit(Surv(time,event) ~ 1,data=data.basic.long.lastTime) summary(km.basic) summary(km.basic)$table autoplot(km.basic) #does batch matter? km.basic.batch <- survfit(Surv(time,event) ~ Batch,data=data.basic.long.lastTime) summary(km.basic.batch) summary(km.basic.batch)$table #slide 19 autoplot(km.basic.batch) survdiff.batch<-survdiff(Surv(time,event) ~ Batch,data=data.basic.long.lastTime) survdiff.batch #reminder, lack of significant difference is not proof of equivalence!!! #Ignoring Batch, does Group matter? (slide 18) km.basic.group <- survfit(Surv(time,event) ~ Group,data=data.basic.long.lastTime) summary(km.basic.group) autoplot(km.basic.group) #slide 20 survdiff.group<-survdiff(Surv(time,event) ~ Group,data=data.basic.long.lastTime) survdiff.group #slide 24 #are there significant differences across the four group/batch combinations #log rank test only compares a single categorical. Here, a dummy variable is made #with all individual groups between the two variables. survdiff.group.batch<-survdiff(Surv(time,event) ~ Group+Batch,data=data.basic.long.lastTime) survdiff.group.batch # Cox regression modeling ############################################################################### cox.basic.batch<-coxph(Surv(time,event) ~ Batch,data=data.basic.long.lastTime) summary(cox.basic.batch) cox.basic.group<-coxph(Surv(time,event) ~ Group,data=data.basic.long.lastTime) summary(cox.basic.group) #with a single categorical variable, and assuming proportional hazards, #Cox is essentially equivalent to Log Rank #are our hazards proportional? (slide2 26-27) test.ph <- cox.zph(cox.basic.group) test.ph #yes, they are (not significant) #we can run regression on both variables at once using "+" cox.basic.group.batch<-coxph(Surv(time,event) ~ Group+Batch,data=data.basic.long.lastTime) summary(cox.basic.group.batch) #slide 28 #use "*" instead of "+" to get Grade, Batch, and their interaction cox.basic.groupXbatch<-coxph(Surv(time,event) ~ Group*Batch,data=data.basic.long.lastTime) summary(cox.basic.groupXbatch) ########################################################### # # Clinical data analysis # ########################################################### # Kaplan-Meier plots and log rank test results ############################################################################### #survfit() needs formula notation. If we want overall survival, we need " ~ 1" km.sarc<-survfit(Surv(time,event)~1,data=data.sarc) autoplot(km.sarc) autoplot(km.sarc, ylim=c(0,1)) #full y axis range to better compare across plots head(data.sarc) data.sarc$'Cancer Type' table(data.sarc$'Cancer Type') #annoyingly, survfit, surdiff do not allow referencing variables with spaces in the name with backticks km.sarc.type<-survfit(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc) autoplot(km.sarc.type, ylim=c(0,1)) survdiff.sarc.type<-survdiff(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc) survdiff.sarc.type table(data.sarc$'FNCLCC Grade') km.sarc.grade<-survfit(Surv(time,event)~ data.sarc$'FNCLCC Grade',data=data.sarc) autoplot(km.sarc.grade, ylim=c(0,1)) survdiff.sarc.grade<-survdiff(Surv(time,event)~ data.sarc$'FNCLCC Grade',data=data.sarc) survdiff.sarc.grade #there are very few grade 1, what happens if we remove them? table(data.sarc$'FNCLCC Grade') km.sarc.grade<-survfit(Surv(time,event)~ data.sarc$'FNCLCC Grade'[!data.sarc$'FNCLCC Grade'%in%1],data=data.sarc[!data.sarc$'FNCLCC Grade'%in%1,]) autoplot(km.sarc.grade, ylim=c(0,1)) survdiff.sarc.grade<-survdiff(Surv(time,event)~ data.sarc$'FNCLCC Grade'[!data.sarc$'FNCLCC Grade'%in%1],data=data.sarc[!data.sarc$'FNCLCC Grade'%in%1,]) survdiff.sarc.grade #another difference from other formula functions in R, # ssurvfit will automatically cast y variables as factors lm(time~`FNCLCC Grade`,data=data.sarc) lm(time~as.factor(`FNCLCC Grade`),data=data.sarc) #let's look at gender table(data.sarc$'Sex') km.sarc.sex<-survfit(Surv(time,event)~ data.sarc$'Sex',data=data.sarc) autoplot(km.sarc.sex, ylim=c(0,1)) survdiff.sarc.sex<-survdiff(Surv(time,event)~ data.sarc$'Sex',data=data.sarc) survdiff.sarc.sex # Cox regression modeling ############################################################################### #multi leveled categorical variable cox.sarc.type<-coxph(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc) cox.sarc.type #coefficients are given relative to the first level, "Nerve Sheath Tumor" #in this case as it comes first alphabetically #if we want to change the order of the variable, we can give them explicitly as levels of an ordered factor data.sarc$'Cancer Type level'<-factor(data.sarc$'Cancer Type',levels=c("Soft Tissue Sarcoma","Uterine Sarcoma","Nerve Sheath Tumor")) km.sarc.type_level<-coxph(Surv(time,event)~ data.sarc$'Cancer Type level',data=data.sarc) km.sarc.type_level #let's look at sex cox.sarc.sex<-coxph(Surv(time,event)~ data.sarc$'Sex',data=data.sarc) summary(cox.sarc.sex) #same answer as for log rank test #cox can also handle continuous variables cox.sarc.age<-coxph(Surv(time,event)~ data.sarc$'Diagnosis Age',data=data.sarc) summary(cox.sarc.age) #how do we plot a continuous variable? One option is to discretize it data.sarc$'Age Group'<-as.factor(ifelse(data.sarc$'Diagnosis Age'>mean(data.sarc$'Diagnosis Age'),"old","young")) km.sarc.ageg<-survfit(Surv(time,event)~ data.sarc$'Age Group',data=data.sarc) km.sarc.ageg.plot<-autoplot(km.sarc.ageg, ylim=c(0,1)) km.sarc.ageg.plot #with discritization we loos a lot of information (and significance) survdiff.sarc.ageg<-survdiff(Surv(time,event)~ data.sarc$'Age Group',data=data.sarc) survdiff.sarc.ageg # visualizing a continouse predictor variable ############################################################################### library(contsurvplot) #plot_surv_area function requires the "variable" to match the variable given in the coxph model #this is not possible with spaces in the name so we create a new variable without spaces data.sarc$Diagnosis_Age<-data.sarc$'Diagnosis Age' cox.sarc.age2<-coxph(Surv(time,event)~ Diagnosis_Age,data=data.sarc,x=TRUE) #plot_surv_area function also requries x=TRUE in coxph model summary(cox.sarc.age2) plot <- plot_surv_area(data = data.sarc, model=cox.sarc.age2,time = "time", status = "event", variable = "Diagnosis_Age") plot