# Clustering tutorial
#
# Get the required packages
#
require("tidyverse")
require("cluster")
require("factoextra")
require("dendextend")
require("NbClust")
#
# Get the swiss data
#
data <- swiss
#
head(data)
#
# Remove any sample with an NA (necessary)
#
df <- na.omit(data)
# Convert to z-scores so each variable has equal weight
df <- scale(df)
head(df)
#
# Build distance/dissimilarity matrices
#
# In dist, method must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
#
manhattan_dist <- dist(df, method = "manhattan")
euclidean_dist <- dist(df, method = "euclidean")
chebyshev_dist <- dist(df, method = "maximum")
canberra_dist <- dist(df, method = "canberra")
minkowski_p3 <- dist(df, method = "minkowski", p = 3)
#
# Visualize the Euclidean distance matrix
#
fviz_dist(euclidean_dist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#
# Hierarchical Clustering
# From https://uc-r.github.io/hc_clustering
# hclust [in the stats package] and agnes [in the cluster package] perform agglomerative hierarchical clustering (HC)
#
# For hclust, method can be "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid";
# we use "complete", "average", "single", and "ward.D" below
#
# Try different combinations
#
hccm <- hclust(manhattan_dist, method = "complete")
plot(hccm, cex = 0.6, hang = -1, main = "Complete Linkage, Manhattan Distance", xlab="Provinces")
#
hcea <- hclust(euclidean_dist, method = "average")
plot(hcea, cex = 0.6, hang = -1, main = "Average Linkage, Euclidean Distance", xlab="Provinces")
#
hcsc <- hclust(chebyshev_dist, method = "single")
plot(hcsc, cex = 0.6, hang = -1, main = "Single Linkage, Chebyshev Distance", xlab="Provinces")
#
hcwm <- hclust(manhattan_dist, method = "ward.D")
plot(hcwm, cex = 0.6, hang = -1, main = "Ward's Method, Manhattan Distance", xlab="Provinces")
#
hcwe <- hclust(canberra_dist, method = "average")
plot(hcwe, cex = 0.6, hang = -1, main = "Average Linkage, Canberra Distance", xlab="Provinces")
#
hcwc <- hclust(minkowski_p3, method = "complete")
plot(hcwc, cex = 0.6, hang = -1, main = "Complete Linkage, Minkowski (p=3) Distance", xlab="Provinces")
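#
# Optional check (a minimal sketch, not required for the steps below): the cophenetic
# correlation gives a rough sense of how well each tree preserves the original
# pairwise distances (values closer to 1 are better). It uses the hclust objects above.
#
cor(manhattan_dist, cophenetic(hccm))   # complete linkage, Manhattan
cor(euclidean_dist, cophenetic(hcea))   # average linkage, Euclidean
cor(chebyshev_dist, cophenetic(hcsc))   # single linkage, Chebyshev
#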
# Compute with agnes, which does not need a pre-computed distance matrix. Default metric is euclidean.
# see https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/agnes.html
#
# Only Euclidean (the default) and Manhattan are allowed as metrics with agnes
#
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Ward's Method using Euclidean Distances")
#
hc3a <- agnes(df, method = "average", metric = "manhattan")
pltree(hc3a, cex = 0.6, hang = -1, main = "Average Linkage using Manhattan Distances")
#
# You can also give agnes a distance (dissimilarity) matrix
#
hc3b <- agnes(chebyshev_dist, method = "single")
pltree(hc3b, cex = 0.6, hang = -1, main = "Single Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "average")
pltree(hc3b, cex = 0.6, hang = -1, main = "Average Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "complete")
pltree(hc3b, cex = 0.6, hang = -1, main = "Complete Linkage, Chebyshev Distance")
#
hc3b <- agnes(chebyshev_dist, method = "ward")
pltree(hc3b, cex = 0.6, hang = -1, main = "Ward's Method, Chebyshev Distance")
#
# Compare agglomerative coefficients
#
hc3 <- agnes(euclidean_dist, method = "single")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "average")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "complete")
hc3$ac
hc3 <- agnes(euclidean_dist, method = "ward")
hc3$ac
#
hc3 <- agnes(manhattan_dist, method = "single")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "average")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "complete")
hc3$ac
hc3 <- agnes(manhattan_dist, method = "ward")
hc3$ac
#
hc3 <- agnes(chebyshev_dist, method = "single")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "average")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "complete")
hc3$ac
hc3 <- agnes(chebyshev_dist, method = "ward")
hc3$ac
#
# Average linkage with Minkowski (p = 3) distance, hclust and agnes
#
hcma <- hclust(minkowski_p3, method = "average")
plot(hcma, cex = 0.6, hang = -1, main = "Average Linkage, Minkowski (p=3) Distance", xlab="Provinces")
hc3m <- agnes(minkowski_p3, method = "average")
pltree(hc3m, cex = 0.6, hang = -1, main = "Average Linkage, Minkowski (p=3) Distance")
#
# Play with dendrograms
#
hc3a <- agnes(df, method = "ward", metric = "manhattan")
# Cut the tree into 3 groups (rect.hclust expects an hclust object, so convert with as.hclust)
pltree(hc3a, cex = 0.6, hang = -1, main = "Ward's Clustering, Manhattan Distance")
rect.hclust(as.hclust(hc3a), k = 3, border = 2:4)
sub_grp <- cutree(hc3a, k = 3)
table(sub_grp)
#
fviz_cluster(list(data = df, cluster = sub_grp))
#
cutree(as.hclust(hc3a), k=3)
# Compare two dendrograms
hc3a <- agnes(df, method = "ward", metric = "manhattan")
hcwe <- hclust(canberra_dist, method = "average")
# Create two dendrograms
dend1 <- as.dendrogram(hc3a)
dend2 <- as.dendrogram(hcwe)
# Use tanglegram to compare
tanglegram(dend1, dend2)
#
# Customize the tanglegram
#
dend_list <- dendlist(dend1, dend2)
#
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE,      # Turn off dashed lines
           common_subtrees_color_lines = FALSE,   # Turn off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list), 2)))
#
# Entanglement is in the range [0, 1], where 0 represents a perfect alignment;
# an entanglement of 0.9 represents a poor alignment.
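#
# Optional refinement (a minimal sketch): dendextend's untangle() searches for
# branch rotations that lower the entanglement before plotting; "step2side" is
# one of its search methods. It operates on the dendlist created above.
#
dend_list_untangled <- untangle(dend_list, method = "step2side")
tanglegram(dend_list_untangled,
           common_subtrees_color_branches = TRUE,
           main = paste("entanglement =", round(entanglement(dend_list_untangled), 2)))
#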
#
#
# Divisive hierarchical clustering using diana (cluster package)
#
dhcm <- diana(df, metric="manhattan", stand=TRUE)
plot(dhcm, main="Divisive Clustering, Manhattan Distance", xlab="Provinces")
#
dhce <- diana(df, metric="euclidean", stand=TRUE)
plot(dhce, main="Divisive Clustering, Euclidean Distance", xlab="Provinces")
#
chebyshev_dist <- dist(df, method = "maximum")
dhcc <- diana(chebyshev_dist, diss=TRUE)
plot(dhcc, main="Divisive Clustering, Chebyshev Distance", xlab="Provinces")
# Create two dendrograms, Manhattan and Euclidean
dend3 <- as.dendrogram(dhcm)
dend4 <- as.dendrogram(dhce)
# Use tanglegram to compare
tanglegram(dend3, dend4)
#
# Determine the entanglement
#
dend_list <- dendlist(dend3, dend4)
#
tanglegram(dend3, dend4,
           highlight_distinct_edges = FALSE,      # Turn off dashed lines
           common_subtrees_color_lines = FALSE,   # Turn off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list), 2)))
#
# How many clusters?
#
# Plot the total within-cluster sum of squared distances as a function
# of the number of clusters to use the elbow method
#
fviz_nbclust(df, FUN = hcut, method = "wss")
#
# Use the average silhouette width
#
# Agglomerative clustering, Manhattan distance
#
hc3c <- agnes(manhattan_dist, method = "ward")
si3 <- silhouette(cutree(hc3c, k=3), dist=manhattan_dist)
plot(si3, nmax=80, cex.names=0.3)
#
# What is the silhouette width of each sample?
#
si3
#
# Divisive clustering, Manhattan distance
# (use the Manhattan-based diana result, dhcm, to match the Manhattan silhouette distance)
#
dhcm_si3 <- silhouette(cutree(dhcm, k=3), dist=manhattan_dist)
plot(dhcm_si3, nmax=80, cex.names=0.3)
dhcm_si3
#
#
# K-means clustering
#
# Check for 2 clusters with 25 random starts (often recommended)
# https://uc-r.github.io/kmeans_clustering
#
# PCA plot with clusters delineated
#
k2 <- kmeans(df, centers = 2, nstart = 25)
k2
#
fviz_cluster(k2, data = df)
#
#
# Compare different numbers of clusters
#
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# Compare PC plots
fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
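#
# Optional: the four PC plots can be arranged in a single figure; a minimal
# sketch assuming the gridExtra package is installed (it is not loaded above)
#
library(gridExtra)
p2 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p3 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
grid.arrange(p2, p3, p4, p5, nrow = 2)
#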
#
# Use the total within-cluster sum of squares to see if the elbow
# method can determine the best K.
#
# The total within-cluster sum of squares can be extracted from $tot.withinss
#
k2$tot.withinss
k3$tot.withinss
k4$tot.withinss
k5$tot.withinss
#
# Let's create a plot
#
# Function to compute the total within-cluster sum of squares
wss <- function(k) {
  kmeans(df, k, nstart = 25)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
set.seed(123)
# Extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
     type="b", pch = 19, frame = FALSE,
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
#
# This can also be done using fviz_nbclust
#
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
#
# Look at the average silhouette width.
# Remember that you want to maximize this.
#
# Use the silhouette function in the cluster package
#
# Function to compute the average silhouette width for k clusters
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(df))
  mean(ss[, 3])
}
# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15
#
# Extract the average silhouette width for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
#
plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes")
#
# This can also be done using fviz_nbclust
#
fviz_nbclust(df, kmeans, method = "silhouette")
#
# Take a look at NbClust
# https://www.jstatsoft.org/article/view/v061i06
# https://www.rdocumentation.org/packages/NbClust/versions/3.0.1/topics/NbClust
# https://cran.r-project.org/web/packages/NbClust/index.html
#
res <- NbClust(df, distance = "euclidean", min.nc=2, max.nc=8, method = "complete", index = "alllong")
#
res$All.index
res$Best.nc
res$Best.partition
#
#
res <- NbClust(df, distance = "euclidean", min.nc=2, max.nc=10, method = "ward.D", index = "all")
#
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
#
res <- NbClust(df, diss=NULL, distance = "euclidean", min.nc=2, max.nc=6, method = "ward.D2", index = "alllong")
#
res$All.index
res$Best.nc
res$Best.partition
#
res <- NbClust(df, diss=NULL, distance = "euclidean", min.nc=2, max.nc=6, method = "kmeans", index = "hubert")
res$All.index
#
res <- NbClust(df, diss=NULL, distance = "manhattan", min.nc=2, max.nc=6, method = "complete", index = "all")
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
#
# K-means
#
res <- NbClust(df, distance = "euclidean", method = "kmeans", index = "alllong")
#
# Use a distance matrix
#
dis_mat <- dist(df, method="canberra", diag=FALSE)
res <- NbClust(df, diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "kmeans", index = "alllong")
#
res <- NbClust(df, diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "ward.D2", index = "alllong")
res$All.index
res$Best.nc
res$All.CriticalValues
res$Best.partition
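#
# Optional (a minimal sketch): when index = "all" or "alllong", each index
# recommends a number of clusters; the first row of Best.nc holds those
# recommendations, so a quick tally summarizes the vote
# (here for the ward.D2 / Canberra run above)
#
table(res$Best.nc[1, ])
barplot(table(res$Best.nc[1, ]),
        xlab = "Recommended number of clusters K", ylab = "Number of indices")
#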
# The data matrix is not available; only the dissimilarity matrix is given.
# In this case, only these indices can be computed: frey, mcclain, cindex, silhouette and dunn
#
res <- NbClust(diss=dis_mat, distance = NULL, min.nc=2, max.nc=8, method = "ward.D2", index = "silhouette")
res$All.index
res$Best.nc
res$Best.partition
#
# Sammon map
#
# do.sammon {Rdimtools}
#
require("Rdimtools")
hcwaeu <- agnes(df, method = "ward", metric = "euclidean")
cutree(as.hclust(hcwaeu), k=3)
groups_ward_eucl <- as.factor(cutree(as.hclust(hcwaeu), k=3))
#
label <- as.factor(rownames(df))
out1 <- do.sammon(df, ndim=2)
out2 <- do.sammon(df, ndim=2, initialize="pca")
par(mfrow=c(1,2))
plot(out1$Y, pch=19, col=groups_ward_eucl, main="out1:rndproj")
plot(out2$Y, pch=19, col=groups_ward_eucl, main="out2:pca")
par(mfrow=c(1,1))
#
# sammon {MASS}
#
require("MASS")
swiss.sam <- sammon(euclidean_dist, niter=1000)
plot(swiss.sam$points, pch=16, col=groups_ward_eucl, main="Sammon Map")
#
# Use the algorithm from ProjectionBasedClustering
#
require("ProjectionBasedClustering")
Proj <- SammonsMapping(df)
plot(Proj$ProjectedPoints, pch=16, col=groups_ward_eucl, main="Sammon Map")
# The last two have IDENTICAL results
#
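#
# Optional (a minimal sketch): label the points on the MASS Sammon map with the
# province names so cluster membership is easier to read, and report the stress
#
plot(swiss.sam$points, pch=16, col=groups_ward_eucl, main="Sammon Map")
text(swiss.sam$points, labels=rownames(df), cex=0.5, pos=3)
swiss.sam$stress
#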