# Make sure required packages are ready for Decision Trees
#
install.packages("rpart")
install.packages("rattle")
install.packages("rpart.plot")
install.packages("RColorBrewer")
install.packages("datasets")
install.packages("randomForest")
install.packages("caret")
install.packages("survival")
install.packages("survminer")
install.packages("Rcpp")
install.packages("gtsummary")
install.packages("ggplot2")
install.packages("LongCART")
#
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(datasets)
library(randomForest)
library(caret)
library(survival)
library(survminer)
library(Rcpp)
library(gtsummary)
library(ggplot2)
library(LongCART)
#
# Decision tree
#
# Use the "kyphosis" dataset, included in rpart
# Data on 81 children who had corrective spinal surgery
# https://www.rdocumentation.org/packages/rpart/versions/4.1.24/topics/kyphosis 
#
KY <- kyphosis
dim(KY)
# Each child has 4 descriptors
head(KY)
#
# Kyphosis: absent or present if kyphosis (a type of deformation) was present after the surgery (21%, 17 of 81)
# Age: age in months of the child
# Number: number of virtebrae involved
# Start: number of the topmost vertebrae operated on
#
table(KY[ ,1])
#
# Kyphosis was present in 17 of the 81 children after surgery
# 21% present
#
fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis, method="class")
rpart.plot(fit)
#
# How many present and absent in the far-right cluster?
#
# It contains 23% of the samples
.23*81
# of which 58% of the 19 children, or 11, had kyphosis
.58*19
# Therefore, 8 did not develop this condition after surgery
#
# How good was this? Build a confusion matrix of the data.
#
predictions <- predict(fit, KY, type="class")
#
confusionMatrix(predictions, KY[ ,1])
#
# Survival Tree using rpart
#
# Use GBSG2: recurrence-free survival (RFS) originated from a prospective randomized clinical trial
#  conducted by the German Breast Cancer Study Group. 
# https://pubmed.ncbi.nlm.nih.gov/7931478/
#
data(GBSG2)
#
# The dataset contains the information from 686 breast cancer patients with the following variables: 
# time (RFS follow-up time in days)
# cens (censoring indicator: 0[censored], 1[event])
# horTH (hormonal therapy: yes, no)
# age (age in years)
# menostat (menopausal status: pre, post)
# tsize (tumour size in mm)
# tgrade (tumour grade: I, II, III)
# pnodes (number of positive nodes)
# progrec (level of progesterone receptor[PR] in fmol)
# estrec (level of the estrogen receptor in fmol).
#
#
GBSG2$time1 <- GBSG2$time/365.25
GBSG2$horTh1 <- as.numeric(GBSG2$horTh)
GBSG2$tgrade1 <- as.numeric(GBSG2$tgrade)
GBSG2$menostat1 <- as.numeric(GBSG2$menostat)
# We also have to add the subject id if that is not already there.
GBSG2$subjid <- 1:nrow(GBSG2)
fit <- survival::survfit(survival::Surv(time1, cens == 1) ~ horTh1 + menostat1 + tgrade1, data = GBSG2)
plot(fit)
#
Dtree <- rpart(Surv(time1, cens == 1) ~ horTh1 + age + menostat1 + tsize + tgrade1 + pnodes + progrec + estrec, data = GBSG2)
rpart.plot(Dtree)
#
# add grid.draw.ggsurvplot so that we can save ggsurvplot
grid.draw.ggsurvplot <- function(x){
  survminer:::print.ggsurvplot(x, newpage = FALSE)
}
#
# Function to plot multiple groups in any order. Just make sure the palette (ppal) matches legend.labs (lleg)
#
plotsurv_any <- function(title_in, ppos, flg, lleg, ppal, rth, fsz, fit) {
  survp <- survminer::ggsurvplot(
    fit,
    xlab = "Years",
    ylab = "Survival rate",
    legend.title = "",
    legend = "top",
    legend.labs = lleg,
    font.legend = flg,
    break.x.by = 1,
    ggtheme = theme_classic2(),
    font.main = 15,
    font.x =  18,
    font.y = 18,
    font.tickslab = c(11, "plain", "black"),
    palette = ppal,
    #linetype = c("solid","dashed", "dotdash"),
    censor = F,
    risk.table = TRUE,
    fontsize = fsz,
    risk.table.title = NULL,
    risk.table.fontsize = 4,
    risk.table.y.text = F,
    risk.table.height = rth,
    risk.table.pos = "out",
    risk.table.col = "strata",
    tables.theme = theme_cleantable(),
    lienetype = "strata",
    pval.coord = c(ppos, 0.85),
    pval.size = 4,
    pval = T)
#
  print(survp)
# Uncomment the next line and include a PATH to the directory for the png-file
#  ggsave(file = paste("C:/statistics/Survival_Analysis/example",title_in,".png",sep=""), survp, device = "png", height = 5.0, width = 6.5, dpi = 300)
}
#
for (i in 1:length(GBSG2[ ,1])) {GBSG2$group[i] <- 9}
for (i in 1:length(GBSG2[ ,1])) {
  if (GBSG2$pnodes[i] < 4) {
    if (GBSG2$tsize[i] < 15) {GBSG2$group[i] <- 1}
    if (GBSG2$tsize[i] >= 15) {
      if (GBSG2$progrec[i] >= 90) {GBSG2$group[i] <- 2}
      if (GBSG2$progrec[i] < 90) {GBSG2$group[i] <- 3}
    }
  }
  if (GBSG2$pnodes[i] >= 4) {
    if (GBSG2$progrec[i] >= 21) {GBSG2$group[i] <- 4}
    if (GBSG2$progrec[i] < 21) {
      if (GBSG2$pnodes[i] < 10) {GBSG2$group[i] <- 5}
      if (GBSG2$pnodes[i] >= 10) {GBSG2$group[i] <- 6}
    }
  }
}
table(GBSG2$group)
#
fit <- survival::survfit(survival::Surv(time1, cens == 1) ~ group, data = GBSG2)
title_in <- "_rpart_example"
ppos <- 6.5
flg <- 12
lleg <- c("Group 1","Group 2","Group 3","Group 4","Group 5","Group 6")
ppal <- c("darkgreen","blue","purple","pink","orange","red")
# rth <- 0.26
rth <- 0.26
fsz <- 7
plotsurv_any(title_in, ppos, flg, lleg, ppal, rth, fsz, fit)
#
# Random Forest
#
#
# To run a Random Forest you need a large number of samples (so the out-of-bag (36.8% on average) is large enough.
# You also need a reasonable number of random variables (Nvar) so that a reasonable subset  of them can be
#  tested at each decision node. The default (mtry) in randomForest is floor(sqrt(Nvar)), so between 16 and 24 variables
#  will have mtry=4.
#
# Random Forest, German Credit Data
# Use creditability as target
# Obtained from https://www.listendata.com/2014/11/random-forest-with-r.html
#
Data <- read.csv("https://raw.githubusercontent.com/deepanshu88/Datasets/master/UploadedFiles/german_credit.csv")
# Check types of variables
str(Data)
head(Data)
#
sum(Data[ ,1])
# Check number of rows and columns
dim(Data)
# Make dependent variable as a factor (categorical)
Data$Creditability = as.factor(Data$Creditability)
#
# Build and plot a regular decision tree
#
Dtree <- rpart(Creditability ~ ., data=Data,method="class")
rpart.plot(Dtree)
#
# How good was this?
#
predictions <- predict(Dtree, Data, type="class")
#
confusionMatrix(predictions, Data[ ,1])
#
# Let's be honest
#
# Split the data into training and testing sets
#
set.seed(123)
trainIndex <- createDataPartition(Data$Creditability, p = 0.7, list = FALSE)
trainData <- Data[trainIndex, ]
testData <- Data[-trainIndex, ]
# Build the model
rpartModel <- rpart(Creditability ~ ., data = trainData, method = "class")
# Predict the test data
predictions <- predict(rpartModel, testData, type = "class")
# Generate the confusion matrix
confusionMatrix(predictions, testData$Creditability)
#
# Random Forest
#
set.seed(123)
rf <-randomForest(Creditability~.,data=Data, ntree=500) 
print(rf)
# Run it again
rf <-randomForest(Creditability~.,data=Data, ntree=500) 
print(rf)
# Are 500 trees enough? Look at 5000.
rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=4) 
print(rf)
plot(rf, type="l")
abline(v=500)
# Vary mtry for ntree=5000
set.seed(111)
#
rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=3) 
print(rf)
#
rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=4) 
print(rf)
#
rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=5) 
print(rf)
#
rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=6) 
print(rf)
#
#
## attributes(rf)
## $names
## [1] "call"            "type"            "predicted"       "err.rate"        "confusion"      
## [6] "votes"           "oob.times"       "classes"         "importance"      "importanceSD"   
## [11] "localImportance" "proximity"       "ntree"           "mtry"            "forest"         
## [16] "y"  
# Predicted Class of each sample from votes across all OOB trees
rf$predicted
# OOB error rate and error for each Class for each tree (stops after 333 trees mecause of "max.print"
rf$err.rate
# Uses the Predicted versus Actual for each sample to build a confusion matrix with Class errors
rf$confusion
# Fraction of votes for each Class for each sample (stops after 500)
rf$votes
# Number of times each sample is OOB
rf$oob.times
# Importance (Mean Decrease of the Gini Index) of each Variable
rf$importance
#