###########################################################
#
# Set up the environment
#
###########################################################

#Reset Workspace
rm(list = ls())

#libraries (slide 5)
library(dplyr)
library(tidyr)
library(readxl)
library(survival)
library(ggplot2)
library(ggfortify)
library(readr)


###########################################################
#
# Load Data: Exampple 1, clinical data
#
###########################################################
#load data from cBioPortal downloaded .tsv file (slides 8-11)

#use full file path if not in working dir (slide 12)
in.file.sarc<-"sarc_tcga_pub_clinical_data.tsv" 

#use read_tsv (readr package), not read.table which will put data in a dataframe and mess up the column names
data.sarc.orig<-read_tsv(in.file.sarc)

#examine results to make sure the data looks how we want
head(data.sarc.orig)
dim(data.sarc.orig)
names(data.sarc.orig)

#need to edit survival status
data.sarc.orig$"Overall Survival Status"

#create a new variable that we will make minor changes on from original data
data.sarc<-data.sarc.orig

#Surv() function expects a numerical or logical for event, 0 for censored, 1 for death
data.sarc$event<-as.numeric(sub(":.*","",data.sarc.orig$"Overall Survival Status"))

#we will create a new variable for time too to save typing later
data.sarc$time<-data.sarc.orig$`Overall Survival (Months)`

#survfit() needs formula notation.  If we want overall survival, we need " ~ 1" (intercept)
km.sarc<-survfit(Surv(time,event)~1,data=data.sarc)

#quick look at survival curve before we move on to another example
autoplot(km.sarc)
autoplot(km.sarc, ylim=c(0,1))




###########################################################
#
# Load Data: Example 2, basic/in vivo data
#
###########################################################

#Basic data is rarely "R friendly"
#See excel file and slides 13-14

#constants
in.file.basic<-"survival data.xlsx"
in.sheet.basic<-"For R"

#we know that batch 1 ended on day 14 and batch 2 ended on day 16
last.day<-c(Run1=14,Run2=16)

#read the worksheet into a tibble
data.basic.oirg<-read_xlsx(in.file.basic,in.sheet.basic)

#examine results to make sure it looks how we want
head(data.basic.oirg)
dim(data.basic.oirg)
names(data.basic.oirg)

#pivot the data longer to more easily fine the last entry for each subject
data.basic.long<-pivot_longer(data.basic.oirg,cols = starts_with("Day"),names_prefix = "Day",names_to = "time", values_to = "tumorVolume")

#examine the long data
data.basic.long

#Set time to numeric data type
data.basic.long$time<-as.numeric(data.basic.long$time)
data.basic.long


#remove blanks
data.basic.long.noNA<-data.basic.long[!is.na(data.basic.long$tumorVolume),]
data.basic.long.noNA

#keep only the last time for each subject as this implicitly encodes death/censoring
data.basic.long.lastTime<- data.basic.long.noNA %>% group_by(Subject, Group) %>%
   filter(time == max(time)) %>%
   arrange(Subject,Group,time)
data.basic.long.lastTime

#add event codes
#Surv() function wants TRUE/FALSE, 1/0, or 2/1 (event/censored)
#we know that batch a ended on day 14 and batch b ended on day 16
data.basic.long.lastTime$event<-ifelse(data.basic.long.lastTime$Batch=="Run1",
                                       ifelse(data.basic.long.lastTime$time==last.day["Run1"],0,1),
                                       ifelse(data.basic.long.lastTime$time==last.day["Run2"],0,1))

#did it work?
print(data.basic.long.lastTime,n=48)

#create and examine the Surv object
data.basic.surv<-Surv(time=data.basic.long.lastTime$time,event=data.basic.long.lastTime$event)
data.basic.surv



# Kaplan-Meier plots and log rank test results
###############################################################################

#overall KM curve (slide 16)
km.basic <- survfit(Surv(time,event) ~ 1,data=data.basic.long.lastTime)
summary(km.basic)
summary(km.basic)$table
autoplot(km.basic)


#does batch matter?
km.basic.batch <- survfit(Surv(time,event) ~ Batch,data=data.basic.long.lastTime)
summary(km.basic.batch)
summary(km.basic.batch)$table #slide 19
autoplot(km.basic.batch)
survdiff.batch<-survdiff(Surv(time,event) ~ Batch,data=data.basic.long.lastTime)
survdiff.batch
#reminder, lack of significant difference is not proof of equivalence!!!

#Ignoring Batch, does Group matter? (slide 18)
km.basic.group <- survfit(Surv(time,event) ~ Group,data=data.basic.long.lastTime)
summary(km.basic.group)
autoplot(km.basic.group) #slide 20
survdiff.group<-survdiff(Surv(time,event) ~ Group,data=data.basic.long.lastTime)
survdiff.group #slide 24

#are there significant differences across the four group/batch combinations
#log rank test only compares a single categorical.  Here, a dummy variable is made
#with all individual groups between the two variables. 
survdiff.group.batch<-survdiff(Surv(time,event) ~ Group+Batch,data=data.basic.long.lastTime)
survdiff.group.batch


# Cox regression modeling
###############################################################################
cox.basic.batch<-coxph(Surv(time,event) ~ Batch,data=data.basic.long.lastTime)
summary(cox.basic.batch)

cox.basic.group<-coxph(Surv(time,event) ~ Group,data=data.basic.long.lastTime)
summary(cox.basic.group)

#with a single categorical variable, and assuming proportional hazards,
#Cox is essentially equivalent to Log Rank


#are our hazards proportional? (slide2 26-27)
test.ph <- cox.zph(cox.basic.group)
test.ph #yes, they are (not significant)

#we can run regression on both variables at once using "+"
cox.basic.group.batch<-coxph(Surv(time,event) ~ Group+Batch,data=data.basic.long.lastTime)
summary(cox.basic.group.batch) #slide 28

#use "*" instead of "+" to get Grade, Batch, and their interaction
cox.basic.groupXbatch<-coxph(Surv(time,event) ~ Group*Batch,data=data.basic.long.lastTime)
summary(cox.basic.groupXbatch)

###########################################################
#
# Clinical data analysis
#
###########################################################


# Kaplan-Meier plots and log rank test results
###############################################################################

#survfit() needs formula notation.  If we want overall survival, we need " ~ 1" 
km.sarc<-survfit(Surv(time,event)~1,data=data.sarc)

autoplot(km.sarc)
autoplot(km.sarc, ylim=c(0,1)) #full y axis range to better compare across plots

head(data.sarc)

data.sarc$'Cancer Type'
table(data.sarc$'Cancer Type')

#annoyingly, survfit, surdiff do not allow referencing variables with spaces in the name with backticks
km.sarc.type<-survfit(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc)
autoplot(km.sarc.type, ylim=c(0,1))
survdiff.sarc.type<-survdiff(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc)
survdiff.sarc.type

table(data.sarc$'FNCLCC Grade')
km.sarc.grade<-survfit(Surv(time,event)~ data.sarc$'FNCLCC Grade',data=data.sarc)
autoplot(km.sarc.grade, ylim=c(0,1))
survdiff.sarc.grade<-survdiff(Surv(time,event)~ data.sarc$'FNCLCC Grade',data=data.sarc)
survdiff.sarc.grade

#there are very few grade 1, what happens if we remove them?
table(data.sarc$'FNCLCC Grade')
km.sarc.grade<-survfit(Surv(time,event)~ data.sarc$'FNCLCC Grade'[!data.sarc$'FNCLCC Grade'%in%1],data=data.sarc[!data.sarc$'FNCLCC Grade'%in%1,])
autoplot(km.sarc.grade, ylim=c(0,1))
survdiff.sarc.grade<-survdiff(Surv(time,event)~ data.sarc$'FNCLCC Grade'[!data.sarc$'FNCLCC Grade'%in%1],data=data.sarc[!data.sarc$'FNCLCC Grade'%in%1,])
survdiff.sarc.grade

#another difference from other formula functions in R, 
#  ssurvfit will automatically cast y variables as factors
lm(time~`FNCLCC Grade`,data=data.sarc)
lm(time~as.factor(`FNCLCC Grade`),data=data.sarc)

#let's look at gender
table(data.sarc$'Sex')
km.sarc.sex<-survfit(Surv(time,event)~ data.sarc$'Sex',data=data.sarc)
autoplot(km.sarc.sex, ylim=c(0,1))
survdiff.sarc.sex<-survdiff(Surv(time,event)~ data.sarc$'Sex',data=data.sarc)
survdiff.sarc.sex


# Cox regression modeling
###############################################################################

#multi leveled categorical variable
cox.sarc.type<-coxph(Surv(time,event)~ data.sarc$'Cancer Type',data=data.sarc)
cox.sarc.type  #coefficients are given relative to the first level, "Nerve Sheath Tumor"
#in this case as it comes first alphabetically

#if we want to change the order of the variable, we can give them explicitly as levels of an ordered factor
data.sarc$'Cancer Type level'<-factor(data.sarc$'Cancer Type',levels=c("Soft Tissue Sarcoma","Uterine Sarcoma","Nerve Sheath Tumor"))
km.sarc.type_level<-coxph(Surv(time,event)~ data.sarc$'Cancer Type level',data=data.sarc)
km.sarc.type_level

#let's look at sex
cox.sarc.sex<-coxph(Surv(time,event)~ data.sarc$'Sex',data=data.sarc)
summary(cox.sarc.sex) #same answer as for log rank test

#cox can also handle continuous variables
cox.sarc.age<-coxph(Surv(time,event)~ data.sarc$'Diagnosis Age',data=data.sarc)
summary(cox.sarc.age)

#how do we plot a continuous variable?  One option is to discretize it
data.sarc$'Age Group'<-as.factor(ifelse(data.sarc$'Diagnosis Age'>mean(data.sarc$'Diagnosis Age'),"old","young"))
km.sarc.ageg<-survfit(Surv(time,event)~ data.sarc$'Age Group',data=data.sarc)
km.sarc.ageg.plot<-autoplot(km.sarc.ageg, ylim=c(0,1))
km.sarc.ageg.plot

#with discritization we loos a lot of information (and significance)
survdiff.sarc.ageg<-survdiff(Surv(time,event)~ data.sarc$'Age Group',data=data.sarc)
survdiff.sarc.ageg

# visualizing a continouse predictor variable
###############################################################################


library(contsurvplot) 
#plot_surv_area function requires the "variable" to match the variable given in the coxph model
#this is not possible with spaces in the name so we create a new variable without spaces
data.sarc$Diagnosis_Age<-data.sarc$'Diagnosis Age'
cox.sarc.age2<-coxph(Surv(time,event)~ Diagnosis_Age,data=data.sarc,x=TRUE)
#plot_surv_area function also requries x=TRUE in coxph model
summary(cox.sarc.age2)
plot <- plot_surv_area(data = data.sarc, model=cox.sarc.age2,time = "time", status = "event", variable = "Diagnosis_Age") 
plot