# Make sure required packages are ready for Decision Trees # install.packages("rpart") install.packages("rattle") install.packages("rpart.plot") install.packages("RColorBrewer") install.packages("datasets") install.packages("randomForest") install.packages("caret") install.packages("survival") install.packages("survminer") install.packages("Rcpp") install.packages("gtsummary") install.packages("ggplot2") install.packages("LongCART") # library(rpart) library(rattle) library(rpart.plot) library(RColorBrewer) library(datasets) library(randomForest) library(caret) library(survival) library(survminer) library(Rcpp) library(gtsummary) library(ggplot2) library(LongCART) # # Decision tree # # Use the "kyphosis" dataset, included in rpart # Data on 81 children who had corrective spinal surgery # https://www.rdocumentation.org/packages/rpart/versions/4.1.24/topics/kyphosis # KY <- kyphosis dim(KY) # Each child has 4 descriptors head(KY) # # Kyphosis: absent or present if kyphosis (a type of deformation) was present after the surgery (21%, 17 of 81) # Age: age in months of the child # Number: number of virtebrae involved # Start: number of the topmost vertebrae operated on # table(KY[ ,1]) # # Kyphosis was present in 17 of the 81 children after surgery # 21% present # fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis, method="class") rpart.plot(fit) # # How many present and absent in the far-right cluster? # # It contains 23% of the samples .23*81 # of which 58% of the 19 children, or 11, had kyphosis .58*19 # Therefore, 8 did not develop this condition after surgery # # How good was this? Build a confusion matrix of the data. # predictions <- predict(fit, KY, type="class") # confusionMatrix(predictions, KY[ ,1]) # # Survival Tree using rpart # # Use GBSG2: recurrence-free survival (RFS) originated from a prospective randomized clinical trial # conducted by the German Breast Cancer Study Group. # https://pubmed.ncbi.nlm.nih.gov/7931478/ # data(GBSG2) # # The dataset contains the information from 686 breast cancer patients with the following variables: # time (RFS follow-up time in days) # cens (censoring indicator: 0[censored], 1[event]) # horTH (hormonal therapy: yes, no) # age (age in years) # menostat (menopausal status: pre, post) # tsize (tumour size in mm) # tgrade (tumour grade: I, II, III) # pnodes (number of positive nodes) # progrec (level of progesterone receptor[PR] in fmol) # estrec (level of the estrogen receptor in fmol). # # GBSG2$time1 <- GBSG2$time/365.25 GBSG2$horTh1 <- as.numeric(GBSG2$horTh) GBSG2$tgrade1 <- as.numeric(GBSG2$tgrade) GBSG2$menostat1 <- as.numeric(GBSG2$menostat) # We also have to add the subject id if that is not already there. GBSG2$subjid <- 1:nrow(GBSG2) fit <- survival::survfit(survival::Surv(time1, cens == 1) ~ horTh1 + menostat1 + tgrade1, data = GBSG2) plot(fit) # Dtree <- rpart(Surv(time1, cens == 1) ~ horTh1 + age + menostat1 + tsize + tgrade1 + pnodes + progrec + estrec, data = GBSG2) rpart.plot(Dtree) # # add grid.draw.ggsurvplot so that we can save ggsurvplot grid.draw.ggsurvplot <- function(x){ survminer:::print.ggsurvplot(x, newpage = FALSE) } # # Function to plot multiple groups in any order. Just make sure the palette (ppal) matches legend.labs (lleg) # plotsurv_any <- function(title_in, ppos, flg, lleg, ppal, rth, fsz, fit) { survp <- survminer::ggsurvplot( fit, xlab = "Years", ylab = "Survival rate", legend.title = "", legend = "top", legend.labs = lleg, font.legend = flg, break.x.by = 1, ggtheme = theme_classic2(), font.main = 15, font.x = 18, font.y = 18, font.tickslab = c(11, "plain", "black"), palette = ppal, #linetype = c("solid","dashed", "dotdash"), censor = F, risk.table = TRUE, fontsize = fsz, risk.table.title = NULL, risk.table.fontsize = 4, risk.table.y.text = F, risk.table.height = rth, risk.table.pos = "out", risk.table.col = "strata", tables.theme = theme_cleantable(), lienetype = "strata", pval.coord = c(ppos, 0.85), pval.size = 4, pval = T) # print(survp) # Uncomment the next line and include a PATH to the directory for the png-file # ggsave(file = paste("C:/statistics/Survival_Analysis/example",title_in,".png",sep=""), survp, device = "png", height = 5.0, width = 6.5, dpi = 300) } # for (i in 1:length(GBSG2[ ,1])) {GBSG2$group[i] <- 9} for (i in 1:length(GBSG2[ ,1])) { if (GBSG2$pnodes[i] < 4) { if (GBSG2$tsize[i] < 15) {GBSG2$group[i] <- 1} if (GBSG2$tsize[i] >= 15) { if (GBSG2$progrec[i] >= 90) {GBSG2$group[i] <- 2} if (GBSG2$progrec[i] < 90) {GBSG2$group[i] <- 3} } } if (GBSG2$pnodes[i] >= 4) { if (GBSG2$progrec[i] >= 21) {GBSG2$group[i] <- 4} if (GBSG2$progrec[i] < 21) { if (GBSG2$pnodes[i] < 10) {GBSG2$group[i] <- 5} if (GBSG2$pnodes[i] >= 10) {GBSG2$group[i] <- 6} } } } table(GBSG2$group) # fit <- survival::survfit(survival::Surv(time1, cens == 1) ~ group, data = GBSG2) title_in <- "_rpart_example" ppos <- 6.5 flg <- 12 lleg <- c("Group 1","Group 2","Group 3","Group 4","Group 5","Group 6") ppal <- c("darkgreen","blue","purple","pink","orange","red") # rth <- 0.26 rth <- 0.26 fsz <- 7 plotsurv_any(title_in, ppos, flg, lleg, ppal, rth, fsz, fit) # # Random Forest # # # To run a Random Forest you need a large number of samples (so the out-of-bag (36.8% on average) is large enough. # You also need a reasonable number of random variables (Nvar) so that a reasonable subset of them can be # tested at each decision node. The default (mtry) in randomForest is floor(sqrt(Nvar)), so between 16 and 24 variables # will have mtry=4. # # Random Forest, German Credit Data # Use creditability as target # Obtained from https://www.listendata.com/2014/11/random-forest-with-r.html # Data <- read.csv("https://raw.githubusercontent.com/deepanshu88/Datasets/master/UploadedFiles/german_credit.csv") # Check types of variables str(Data) head(Data) # sum(Data[ ,1]) # Check number of rows and columns dim(Data) # Make dependent variable as a factor (categorical) Data$Creditability = as.factor(Data$Creditability) # # Build and plot a regular decision tree # Dtree <- rpart(Creditability ~ ., data=Data,method="class") rpart.plot(Dtree) # # How good was this? # predictions <- predict(Dtree, Data, type="class") # confusionMatrix(predictions, Data[ ,1]) # # Let's be honest # # Split the data into training and testing sets # set.seed(123) trainIndex <- createDataPartition(Data$Creditability, p = 0.7, list = FALSE) trainData <- Data[trainIndex, ] testData <- Data[-trainIndex, ] # Build the model rpartModel <- rpart(Creditability ~ ., data = trainData, method = "class") # Predict the test data predictions <- predict(rpartModel, testData, type = "class") # Generate the confusion matrix confusionMatrix(predictions, testData$Creditability) # # Random Forest # set.seed(123) rf <-randomForest(Creditability~.,data=Data, ntree=500) print(rf) # Run it again rf <-randomForest(Creditability~.,data=Data, ntree=500) print(rf) # Are 500 trees enough? Look at 5000. rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=4) print(rf) plot(rf, type="l") abline(v=500) # Vary mtry for ntree=5000 set.seed(111) # rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=3) print(rf) # rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=4) print(rf) # rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=5) print(rf) # rf <-randomForest(Creditability~.,data=Data, ntree=5000, mtry=6) print(rf) # # ## attributes(rf) ## $names ## [1] "call" "type" "predicted" "err.rate" "confusion" ## [6] "votes" "oob.times" "classes" "importance" "importanceSD" ## [11] "localImportance" "proximity" "ntree" "mtry" "forest" ## [16] "y" # Predicted Class of each sample from votes across all OOB trees rf$predicted # OOB error rate and error for each Class for each tree (stops after 333 trees mecause of "max.print" rf$err.rate # Uses the Predicted versus Actual for each sample to build a confusion matrix with Class errors rf$confusion # Fraction of votes for each Class for each sample (stops after 500) rf$votes # Number of times each sample is OOB rf$oob.times # Importance (Mean Decrease of the Gini Index) of each Variable rf$importance #