# Clustering tutorial
#
# Get the required packages
#
require("tidyverse")
require("cluster")
require("factoextra")
require("dendextend")
require("NbClust")
#
# Get the swiss data
#
data <- swiss
#
head(data)
#
# Remove any sample with an NA (necessary)
#
df <- na.omit(data)
# Convert to z-scores so each variable has equal weight
df <- scale(df)
head(df)
#
# Build distance/dissimilarity matrices
#
# In dist, method must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
#
manhattan_dist <- dist(df, method = "manhattan")
euclidean_dist <- dist(df, method = "euclidean")
chebyshev_dist <- dist(df, method = "maximum")
canberra_dist <- dist(df, method = "canberra")
minkowski_p3 <- dist(df, method = "minkowski", p = 3)
#
# Visualize the Euclidean distance matrix
#
fviz_dist(euclidean_dist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#
# Hierarchical Clustering
# From https://uc-r.github.io/hc_clustering
# hclust [in the stats package] and agnes [in the cluster package] perform agglomerative hierarchical clustering (HC)
#
# For hclust, method can be "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid";
# we use "complete", "average", "single", and "ward.D" below
#
# Try different combinations
#
hccm <- hclust(manhattan_dist, method = "complete")
plot(hccm, cex = 0.6, hang = -1, main = "Complete Linkage, Manhattan Distance", xlab="Provinces")
#
hcea <- hclust(euclidean_dist, method = "average")
plot(hcea, cex = 0.6, hang = -1, main = "Average Linkage, Euclidean Distance", xlab="Provinces")
#
hcsc <- hclust(chebyshev_dist, method = "single")
plot(hcsc, cex = 0.6, hang = -1, main = "Single Linkage, Chebyshev Distance", xlab="Provinces")
#
hcwm <- hclust(manhattan_dist, method = "ward.D")
plot(hcwm, cex = 0.6, hang = -1, main = "Ward's Method, Manhattan Distance", xlab="Provinces")
#
hcwe <- hclust(canberra_dist, method = "average")
plot(hcwe, cex = 0.6, hang = -1, main = "Average Linkage, Canberra Distance", xlab="Provinces")
#
hcwc <- hclust(minkowski_p3, method = "complete")
plot(hcwc, cex = 0.6, hang = -1, main = "Complete Linkage, Minkowski (p=3) Distance", xlab="Provinces")
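#
# Optional check (a minimal sketch, not required for the steps below): the cophenetic
# correlation gives a rough sense of how well each tree preserves the original
# pairwise distances (values closer to 1 are better). It uses the hclust objects above.
#
cor(manhattan_dist, cophenetic(hccm))   # complete linkage, Manhattan
cor(euclidean_dist, cophenetic(hcea))   # average linkage, Euclidean
cor(chebyshev_dist, cophenetic(hcsc))   # single linkage, Chebyshev
#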
# Compute with agnes, which does not need a pre-computed distance matrix. Default metric is euclidean.
# see https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/agnes.html
#
# Only Euclidean (the default) and Manhattan are allowed as metrics with agnes
#
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Ward's Method using Euclidean Distances")
#
hc3a <- agnes(df, method = "average", metric = "manhattan")
pltree(hc3a, cex = 0.6, hang = -1, main = "Average Linkage using Manhattan Distances")
#
# You can also give agnes a distance (dissimilarity) matrix
#
hc3b <- agnes(chebyshev_dist, method = "single")
pltree(hc3b, cex = 0.6, hang = -1, main = "Single Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "average")
pltree(hc3b, cex = 0.6, hang = -1, main = "Average Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "complete")
pltree(hc3b, cex = 0.6, hang = -1, main = "Complete Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "ward")
pltree(hc3b, cex = 0.6, hang = -1, main = "Ward's Method, Chebyshev Distance")
#
# Compare agglomerative coefficients
#
hc3 <- agnes(euclidean_dist, method = "single")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "average")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "complete")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "ward")
hc3$ac
#
hc3 <- agnes(manhattan_dist, method = "single")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "average")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "complete")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "ward")
hc3$ac
#
hc3 <- agnes(chebyshev_dist, method = "single")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "average")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "complete")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "ward")
hc3$ac
#
# Average linkage with Minkowski (p = 3) distance, hclust and agnes
#
hcma <- hclust(minkowski_p3, method = "average")
plot(hcma, cex = 0.6, hang = -1, main = "Average Linkage, Minkowski (p=3) Distance", xlab="Provinces")
hc3m <- agnes(minkowski_p3, method = "average")
pltree(hc3m, cex = 0.6, hang = -1, main = "Average Linkage, Minkowski (p=3) Distance")
#
# Play with dendrograms
#
hc3a <- agnes(df, method = "ward", metric = "manhattan")
# Cut the tree into 3 groups (rect.hclust expects an hclust object, so convert with as.hclust)
pltree(hc3a, cex = 0.6, hang = -1, main = "Ward's Clustering, Manhattan Distance")
rect.hclust(as.hclust(hc3a), k = 3, border = 2:4)
sub_grp <- cutree(hc3a, k = 3)
table(sub_grp)
#
fviz_cluster(list(data = df, cluster = sub_grp))
#
cutree(as.hclust(hc3a), k=3)
# Compare two dendrograms
hc3a <- agnes(df, method = "ward", metric = "manhattan")
hcwe <- hclust(canberra_dist, method = "average")
# Create two dendrograms
dend1 <- as.dendrogram(hc3a)
dend2 <- as.dendrogram(hcwe)
# Use tanglegram to compare
tanglegram(dend1, dend2)
#
# Customize the tanglegram
#
dend_list <- dendlist(dend1, dend2)
#
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE,      # Turn off dashed lines
           common_subtrees_color_lines = FALSE,   # Turn off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list), 2)))
#
# Entanglement is in the range [0, 1], where 0 represents a perfect alignment;
# an entanglement of 0.9 represents a poor alignment.
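#
# Optional refinement (a minimal sketch): dendextend's untangle() searches for
# branch rotations that lower the entanglement before plotting; "step2side" is
# one of its search methods. It operates on the dendlist created above.
#
dend_list_untangled <- untangle(dend_list, method = "step2side")
tanglegram(dend_list_untangled,
           common_subtrees_color_branches = TRUE,
           main = paste("entanglement =", round(entanglement(dend_list_untangled), 2)))
#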
#
#
# Divisive hierarchical clustering using diana (cluster package)
#
dhcm <- diana(df, metric="manhattan", stand=TRUE)
plot(dhcm, main="Divisive Clustering, Manhattan Distance", xlab="Provinces")
#
dhce <- diana(df, metric="euclidean", stand=TRUE)
plot(dhce, main="Divisive Clustering, Euclidean Distance", xlab="Provinces")
#
chebyshev_dist <- dist(df, method = "maximum")
dhcc <- diana(chebyshev_dist, diss=TRUE)
plot(dhcc, main="Divisive Clustering, Chebyshev Distance", xlab="Provinces")
# Create two dendrograms, Manhattan and Euclidean
dend3 <- as.dendrogram(dhcm)
dend4 <- as.dendrogram(dhce)
# Use tanglegram to compare
tanglegram(dend3, dend4)
#
# Determine the entanglement
#
dend_list <- dendlist(dend3, dend4)
#
tanglegram(dend3, dend4,
           highlight_distinct_edges = FALSE,      # Turn off dashed lines
           common_subtrees_color_lines = FALSE,   # Turn off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list), 2)))
#
# How many clusters?
#
# Plot the total within-cluster sum of squared distances as a function
# of the number of clusters to use the elbow method
#
fviz_nbclust(df, FUN = hcut, method = "wss")
#
# Use the average silhouette width
#
# Agglomerative clustering, Manhattan distance
#
hc3c <- agnes(manhattan_dist, method = "ward")
si3 <- silhouette(cutree(hc3c, k=3), dist=manhattan_dist)
plot(si3, nmax=80, cex.names=0.3)
#
# What is the silhouette width of each sample?
#
si3
#
# Divisive clustering, Manhattan distance
# (use the Manhattan-based diana result, dhcm, to match the Manhattan silhouette distance)
#
dhcm_si3 <- silhouette(cutree(dhcm, k=3), dist=manhattan_dist)
plot(dhcm_si3, nmax=80, cex.names=0.3)
dhcm_si3
#
#
# K-means clustering
#
# Check for 2 clusters with 25 random starts (often recommended)
# https://uc-r.github.io/kmeans_clustering
#
# PCA plot with clusters delineated
#
k2 <- kmeans(df, centers = 2, nstart = 25)
k2
#
fviz_cluster(k2, data = df)
#
#
# Compare different numbers of clusters
#
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# Compare PC plots
fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
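#
# Optional: the four PC plots can be arranged in a single figure; a minimal
# sketch assuming the gridExtra package is installed (it is not loaded above)
#
library(gridExtra)
p2 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p3 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
grid.arrange(p2, p3, p4, p5, nrow = 2)
#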
#
# Use the total within-cluster sum of squares to see if the elbow
# method can determine the best K.
#
# The total within-cluster sum of squares can be extracted from $tot.withinss
#
k2$tot.withinss
k3$tot.withinss
k4$tot.withinss
k5$tot.withinss
#
# Let's create a plot
#
# Function to compute the total within-cluster sum of squares
wss <- function(k) {
  kmeans(df, k, nstart = 25)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
set.seed(123)
# Extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
     type="b", pch = 19, frame = FALSE,
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
#
# This can also be done using fviz_nbclust
#
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
#
# Look at the average silhouette width.
# Remember that you want to maximize this.
#
# Use the silhouette function in the cluster package
#
# Function to compute the average silhouette width for k clusters
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(df))
  mean(ss[, 3])
}
# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15
#
# Extract the average silhouette width for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
#
plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes")
#
# This can also be done using fviz_nbclust
#
fviz_nbclust(df, kmeans, method = "silhouette")
#
# Take a look at NbClust
# https://www.jstatsoft.org/article/view/v061i06
# https://www.rdocumentation.org/packages/NbClust/versions/3.0.1/topics/NbClust
# https://cran.r-project.org/web/packages/NbClust/index.html
#
res <- NbClust(df, distance = "euclidean", min.nc=2, max.nc=8, method = "complete", index = "alllong")
#
res$All.index
res$Best.nc
res$Best.partition
#
#
res <- NbClust(df, distance = "euclidean", min.nc=2, max.nc=10, method = "ward.D", index = "all")
#
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
#
res <- NbClust(df, diss=NULL, distance = "euclidean", min.nc=2, max.nc=6, method = "ward.D2", index = "alllong")
#
res$All.index
res$Best.nc
res$Best.partition
#
res <- NbClust(df, diss=NULL, distance = "euclidean", min.nc=2, max.nc=6, method = "kmeans", index = "hubert")
res$All.index
#
res <- NbClust(df, diss=NULL, distance = "manhattan", min.nc=2, max.nc=6, method = "complete", index = "all")
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
#
# K-means
#
res <- NbClust(df, distance = "euclidean", method = "kmeans", index = "alllong")
#
# Use a distance matrix
#
dis_mat <- dist(df, method="canberra", diag=FALSE)
res <- NbClust(df, diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "kmeans", index = "alllong")
#
res <- NbClust(df, diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "ward.D2", index = "alllong")
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
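#
# Optional (a minimal sketch): when index = "all" or "alllong", each index
# recommends a number of clusters; the first row of Best.nc holds those
# recommendations, so a quick tally summarizes the vote
# (here for the ward.D2 / Canberra run above)
#
table(res$Best.nc[1, ])
barplot(table(res$Best.nc[1, ]),
        xlab = "Recommended number of clusters K", ylab = "Number of indices")
#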
# The data matrix is not available; only the dissimilarity matrix is given.
# In this case, only these indices can be computed: frey, mcclain, cindex, silhouette and dunn
#
res <- NbClust(diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "ward.D2", index = "silhouette")
res$All.index
res$Best.nc
res$Best.partition
#
# Sammon map
#
# do.sammon {Rdimtools}
#
require("Rdimtools")
hcwaeu <- agnes(df, method = "ward", metric = "euclidean")
cutree(as.hclust(hcwaeu), k=3)
groups_ward_eucl <- as.factor(cutree(as.hclust(hcwaeu), k=3))
#
label <- as.factor(rownames(df))
out1 <- do.sammon(df, ndim=2)
out2 <- do.sammon(df, ndim=2, initialize="pca")
par(mfrow=c(1,2))
plot(out1$Y, pch=19, col=groups_ward_eucl, main="out1:rndproj")
plot(out2$Y, pch=19, col=groups_ward_eucl, main="out2:pca")
par(mfrow=c(1,1))
#
# sammon {MASS}
#
require("MASS")
swiss.sam <- sammon(euclidean_dist, niter=1000)
plot(swiss.sam$points, pch=16, col=groups_ward_eucl, main="Sammon Map")
#
# Use the algorithm from ProjectionBasedClustering
#
require("ProjectionBasedClustering")
Proj <- SammonsMapping(df)
plot(Proj$ProjectedPoints, pch=16, col=groups_ward_eucl, main="Sammon Map")
# The last two have IDENTICAL results
#
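#
# Optional (a minimal sketch): label the points on the MASS Sammon map with the
# province names so cluster membership is easier to read, and report the stress
#
plot(swiss.sam$points, pch=16, col=groups_ward_eucl, main="Sammon Map")
text(swiss.sam$points, labels=rownames(df), cex=0.5, pos=3)
swiss.sam$stress
#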