1 Clustering Techniques

1.1 Load Libraries

# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages if not available
# NOTE(review): `"pkg" %in% installed.packages()` matches against every column
# of the installed-packages matrix, not only the Package column; it works in
# practice, but requireNamespace("pkg", quietly = TRUE) is the precise test.
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)

if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

if(! "pROC" %in% installed.packages()) { install.packages("pROC", dependencies = TRUE) }
library(pROC)

if(! "caret" %in% installed.packages()) { install.packages("caret", dependencies = TRUE) }
library(caret)

if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Global Settings
# Print with 4 significant digits and strongly prefer fixed notation.
options(digits =   4)
options(scipen = 999)

# Set Working Directory
# NOTE(review): an absolute setwd() makes the script machine-specific;
# confirm this path exists on the machine that knits the document.
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Make date string (YYMMDD, Singapore time zone)
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

1.2 Fix Broken Functions

# Loading other packages if not available
if(! "factoextra" %in% installed.packages()) { install.packages("factoextra", dependencies = TRUE) }
library(factoextra)
if(! "NbClust" %in% installed.packages()) { install.packages("NbClust", dependencies = TRUE) }
library(NbClust)

# Fix the Functions
# Patched copy of factoextra::fviz_nbclust().
# Determines the optimal number of clusters via the "silhouette", "wss"
# (elbow) or "gap_stat" method and returns a ggplot, or summarises an
# NbClust() result object.
#
# x          : numeric matrix / data.frame, or the list returned by NbClust().
# FUNcluster : clustering function (kmeans, pam, hcut, clara, ...); required
#              unless `x` is an NbClust() result.
# method     : one of "silhouette", "wss", "gap_stat".
# diss       : optional dissimilarity (dist) object; stats::dist(x) when NULL.
# k.max      : largest number of clusters to try (must be >= 2).
# nboot      : bootstrap sample count for the gap statistic.
# verbose    : passed to cluster::clusGap().
# barfill, barcolor, linecolor : plot colours.
# print.summary : print the NbClust vote summary before plotting.
# ...        : forwarded to FUNcluster / cluster::clusGap().
# Returns a ggplot object (or prints the NbClust summary).
fviz_nbclust <- function (x, FUNcluster = NULL, method = c("silhouette", "wss", 
                                           "gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(), 
          barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue", 
          print.summary = TRUE, ...) 
{
  set.seed(123)  # reproducible bootstrap for the gap statistic
  if (k.max < 2) 
    stop("k.max must be >= 2")  # BUG FIX: message previously read "must bet > = 2"
  method <- match.arg(method)
  if (!inherits(x, c("data.frame", "matrix")) && !("Best.nc" %in% 
                                                   names(x))) 
    stop("x should be an object of class matrix/data.frame or ", 
         "an object created by the function NbClust() [NbClust package].")
  if (inherits(x, "list") && "Best.nc" %in% names(x)) {
    best_nc <- x$Best.nc
    # inherits() instead of class() comparison: robust on R >= 4.0 where
    # matrices carry class c("matrix", "array").
    if (inherits(best_nc, "numeric")) 
      print(best_nc)
    else if (inherits(best_nc, "matrix"))
      .viz_NbClust(x, print.summary, barfill, barcolor)
  }
  else if (is.null(FUNcluster)) 
    stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
  else if (!is.function(FUNcluster)) {
    stop("The argument FUNcluster should be a function. ", 
         "Check if you're not overriding the specified function name somewhere.")
  }
  else if (method %in% c("silhouette", "wss")) {
    if (is.data.frame(x)) 
      x <- as.matrix(x)
    if (is.null(diss)) 
      diss <- stats::dist(x)
    v <- rep(0, k.max)
    if (method == "silhouette") {
      # silhouette is undefined for k = 1, so v[1] stays 0
      for (i in 2:k.max) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_ave_sil_width(diss, clust$cluster)
      }
    }
    else if (method == "wss") {
      for (i in 1:k.max) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_withinSS(diss, clust$cluster)
      }
    }
    df <- data.frame(clusters = as.factor(1:k.max), y = v, 
                     stringsAsFactors = TRUE)
    ylab <- "Total Within Sum of Square"
    if (method == "silhouette") 
      ylab <- "Average silhouette width"
    p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1, 
                        color = linecolor, ylab = ylab, xlab = "Number of clusters k", 
                        main = "Optimal number of clusters")
    if (method == "silhouette") 
      p <- p + geom_vline(xintercept = which.max(v), linetype = 2, 
                          color = linecolor)
    return(p)
  }
  else if (method == "gap_stat") {
    extra_args <- list(...)
    gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max, 
                                 B = nboot, verbose = verbose, ...)
    # Honour a user-supplied maxSE; otherwise use firstSEmax with SE.factor 1.
    if (!is.null(extra_args$maxSE)) 
      maxSE <- extra_args$maxSE
    else maxSE <- list(method = "firstSEmax", SE.factor = 1)
    p <- fviz_gap_stat(gap_stat, linecolor = linecolor, 
                       maxSE = maxSE)
    return(p)
  }
}

# Internal helper: visualise the NbClust() "Best.nc" votes as a bar chart of
# how many indices propose each number of clusters.
#   x             : list returned by NbClust() (must contain Best.nc).
#   print.summary : also print a textual vote summary to the console.
#   barfill, barcolor : bar colours.
# Returns a ggplot object (or prints Best.nc when it is a plain vector).
.viz_NbClust <- function (x, print.summary = TRUE, barfill = "steelblue", 
          barcolor = "steelblue") 
{
  best_nc <- x$Best.nc
  if (inherits(best_nc, "numeric"))
    print(best_nc)
  else if (inherits(best_nc, "matrix")) {
    best_nc <- as.data.frame(t(best_nc), stringsAsFactors = TRUE)
    best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
    # BUG FIX: `ss` was previously computed inside the print.summary branch
    # but used unconditionally below, so print.summary = FALSE raised
    # "object 'ss' not found". Compute it before the branch.
    ss <- summary(best_nc$Number_clusters)
    if (print.summary) {
      cat("Among all indices: \n===================\n")
      for (i in seq_along(ss)) {
        cat("*", ss[i], "proposed ", names(ss)[i], 
            "as the best number of clusters\n")
      }
      cat("\nConclusion\n=========================\n")
      cat("* According to the majority rule, the best number of clusters is ", 
          names(which.max(ss)), ".\n\n")
    }
    df <- data.frame(Number_clusters = names(ss), freq = ss, 
                     stringsAsFactors = TRUE)
    p <- ggpubr::ggbarplot(df, x = "Number_clusters", 
                           y = "freq", fill = barfill, color = barcolor) + 
      labs(x = "Number of clusters k", y = "Frequency among all indices", 
           title = paste0("Optimal number of clusters - k = ", 
                          names(which.max(ss))))
    return(p)
  }
}

# Get the average silhouette width
# ++++++++++++++++++++++++++
# Cluster package required
# d: dist object
# cluster: cluster number of observation
.get_ave_sil_width <- function(d, cluster){
  # Mean silhouette width over all observations.
  #   d       : a stats::dist dissimilarity object
  #   cluster : integer vector of cluster memberships, one per observation
  # Requires the cluster package for silhouette().
  if (!requireNamespace("cluster", quietly = TRUE)) {
    stop("cluster package needed for this function to work. Please install it.")
  }
  sil <- cluster::silhouette(cluster, d)
  sil_widths <- sil[, 3]
  sum(sil_widths) / length(sil_widths)
}

# Get total within sum of square
# +++++++++++++++++++++++++++++
# d: dist object
# cluster: cluster number of observation
# Total within-cluster sum of squares computed from a dissimilarity object.
#   d       : dist object (or anything coercible via stats::as.dist)
#   cluster : cluster membership vector, one entry per observation
# For each cluster, the sum of squared pairwise distances divided by the
# cluster size equals the within-cluster sum of squares; these are summed.
.get_withinSS <- function(d, cluster){
  d <- stats::as.dist(d)
  cn <- max(cluster)
  clusterf <- as.factor(cluster)
  clusterl <- levels(clusterf)
  cnn <- length(clusterl)

  # Renumber labels to 1..cnn when they are not contiguous (e.g. c(1, 3)).
  if (cn != cnn) {
    warning("cluster renumbered because maximum != number of clusters")
    for (i in seq_len(cnn)) cluster[clusterf == clusterl[i]] <- i
    cn <- cnn
  }
  # (Removed unused local `cwn` from the original.)
  dmat <- as.matrix(d)
  within.cluster.ss <- 0
  for (i in seq_len(cn)) {
    cluster.size <- sum(cluster == i)
    di <- as.dist(dmat[cluster == i, cluster == i])
    within.cluster.ss <- within.cluster.ss + sum(di^2)/cluster.size
  }
  within.cluster.ss
}

2 k-Means Clustering for Simple Data

2.1 Get Data

# Get Data
# Toy data set: 8 points (rows a-h) in two dimensions X and Y.
df <- data.frame(X = c(1, 3, 4, 5, 1, 4, 1, 2),
                 Y = c(3, 3, 3, 3, 2, 2, 1, 1),
         row.names = c("a", "b", "c", "d", "e", "f", "g", "h")           )

# Write Data to WD
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
write.csv(df, file = "Simple.csv")

# Scale Data - No Need Since Data Have Similar Dimension
# numeric_cols <- sapply(Simple, is.numeric)
# Simple[, numeric_cols] <- scale(Simple[, numeric_cols])

2.2 Explore Data for df

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Show Characteristics of Data Frame
# The lines starting with "## " below are rendered console output.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(df)
## [1] "X" "Y"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(df)
## 'data.frame':    8 obs. of  2 variables:
##  $ X: num  1 3 4 5 1 4 1 2
##  $ Y: num  3 3 3 3 2 2 1 1
cat("\n\nAll Rows of Data Frame:\n")
## 
## 
## All Rows of Data Frame:
head(df, 8)
##   X Y
## a 1 3
## b 3 3
## c 4 3
## d 5 3
## e 1 2
## f 4 2
## g 1 1
## h 2 1
cat("\n\nDescriptive Statistics of Columns in OriginalData Frame:\n")
## 
## 
## Descriptive Statistics of Columns in OriginalData Frame:
# vtable::st() summary table; `summ` picks the statistics per numeric column,
# `summ.names` supplies the matching column headers.
st(df, add.median = TRUE, out = "csv", col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##   Variable N  Mean     SD Min  P25 P50 P75 Max NA Mode
## 1        X 8 2.625  1.598   1    1 2.5   4   5  0     
## 2        Y 8  2.25 0.8864   1 1.75 2.5   3   3  0

2.3 Plot Data

# Loading other packages if not available
# NOTE(review): graphics is a base package shipped with every R install;
# the install.packages() guard is unnecessary (though harmless).
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)

# Color selection
Colors <- c("orange", "green", "red", "blue", "yellow", "pink", "lightgreen", "lightblue") 

# Show Data Frame
cat("\n\nShow df Data:\n")
## 
## 
## Show df Data:
# Scatter plot of the 8 toy points; labels a-h are drawn below each point.
plot(df$X, df$Y, type = "p", 
     main = "Dot Plot of df Data",
     col = "#7c0c18", 
     pch = 19,
     cex = 2.5,
     xlab = "X", 
     ylab = "Y",
     xlim = c(0.5, 5.5),
     ylim = c(0.5, 3.5))
grid(nx = NULL, ny = NULL,
     lty = 2,              # Grid line type
     col = "lightgrey",    # Grid line color
     lwd = 1)              # Grid line width
# legend("topleft", fill = Colors, legend = c("a", "b", "c", "d", "e", "f", "g", "h"), horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")
text(df$X, df$Y, row.names(df), cex = 1.6, pos = 1, col = "#7c0c18")

2.4 Estimating the Optimal Number of Clusters

# Plot Graph of Cluster Number vs SSq (elbow method, k = 1..6)
# BUG FIX: removed `plot.x`, which is not a valid ggplot2 theme element,
# and the inert `col` / `pch` entries in labs() (no colour or shape
# aesthetic is mapped in this plot, so those labels had no effect).
fviz_nbclust(df, kmeans, method = "wss", linecolor = "red", k.max = 6
    ) +
#  geom_vline(xintercept = 4, size = 1, linetype = 2, color = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
    ) +
labs(title = "Number of Clusters and Respective Sum of Squares",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain",
       x = "Number of Clusters", 
       y = "Total Within Sum of Squares"
     )

2.5 Build Clusters

# Calculate Cluster Data 
# kmeans() with nstart = 25 tries 25 random starting configurations and
# keeps the best solution (lowest total within-cluster sum of squares).
km2 <- kmeans(df, centers = 2, nstart = 25)
km3 <- kmeans(df, centers = 3, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:
print(km2)
## K-means clustering with 2 clusters of sizes 4, 4
## 
## Cluster means:
##      X    Y
## 1 4.00 2.75
## 2 1.25 1.75
## 
## Clustering vector:
## a b c d e f g h 
## 2 1 1 1 2 1 2 2 
## 
## Within cluster sum of squares by cluster:
## [1] 2.75 3.50
##  (between_SS / total_SS =  73.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show members of Clusters
# Create a data frame with the original data and the cluster assignments
clustered_data <- data.frame(df, Cluster = km2$cluster)

# Print members of each cluster
# kmeans() labels clusters 1..k contiguously, so iterating to the maximum
# label visits every cluster.
for (i in 1:max(clustered_data$Cluster)) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
## 
## Cluster 1 :
##   X Y Cluster
## b 3 3       1
## c 4 3       1
## d 5 3       1
## f 4 2       1
## 
## 
## Cluster 2 :
##   X Y Cluster
## a 1 3       2
## e 1 2       2
## g 1 1       2
## h 2 1       2
# List Means with Original Data
aggregate(df, by = list(cluster = km2$cluster), mean)
##   cluster    X    Y
## 1       1 4.00 2.75
## 2       2 1.25 1.75

2.6 Visualise Clusters with 2 Centres

# Visualize k-means clusters (k = 2)
# BUG FIX: theme() previously used `plot.x` / `plot.y`, which are not
# ggplot2 theme elements, and ended with a trailing comma (an empty
# argument). Axis-title sizes now use axis.title.x / axis.title.y.
fviz_cluster(km2, data = df, geom = "point", 
             palette = c("#2E9FDF", "#FC4E07", "green"), 
             ellipse.type = "euclid",       # Concentration ellipse (can be euclid, norm)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE,             # Avoid label overplotting (slow)
             legend.title = "Clusters",
             stand = FALSE
) +
theme(
    panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain"
) 

2.7 Visualise Clusters with 3 Centres

# Visualize k-means clusters (k = 3)
# BUG FIX: theme() previously used `plot.x` / `plot.y`, which are not
# ggplot2 theme elements, and ended with a trailing comma (an empty
# argument). Axis-title sizes now use axis.title.x / axis.title.y.
fviz_cluster(km3, data = df, geom = "point", 
             palette = c("#2E9FDF", "#FC4E07", "green"), 
             ellipse.type = "euclid",       # Concentration ellipse (can be euclid, norm)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE,             # Avoid label overplotting (slow)
             legend.title = "Clusters",
             stand = FALSE
) +
theme(
    panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain"
) 

2.8 Check Whether Clusters Are Valid

# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)

# Standardize
# NOTE(review): scale() returns a matrix, so from here on `df` is no longer
# a data.frame; later chunks operate on this standardised version.
df <- scale(df)

# K-means clustering
# eclust() (factoextra) augments the kmeans result with silhouette info.
km.res <- eclust(df, "kmeans", k = 2, nstart = 25, graph = FALSE)

# Hierarchical clustering
hc.res <- eclust(df, "hclust", k = 2, hc_metric = "euclidean",
                 hc_method = "ward.D2", graph = FALSE)

# Silhouette information
silinfo <- km.res$silinfo
names(silinfo)
## [1] "widths"          "clus.avg.widths" "avg.width"
# Silhouette widths of each observation
head(silinfo$widths[, 1:3], 10)
##   cluster neighbor sil_width
## g       1        2    0.5172
## e       1        2    0.4436
## h       1        2    0.4051
## a       1        2    0.0240
## c       2        1    0.6689
## d       2        1    0.6345
## b       2        1    0.4626
## f       2        1    0.3775

2.9 Plot Cluster Silhouette Width

# Visualise Silhouettes
# BUG FIX: theme() previously used invalid `plot.x` / `plot.y` elements and
# ended with a trailing comma (an empty argument); replaced with
# axis.title.x / axis.title.y.
# NOTE(review): the cluster-plot arguments (ellipse.type, star.plot, repel)
# are presumably ignored by fviz_silhouette(); kept for parity with the
# other chunks — confirm and prune if desired.
fviz_silhouette(km.res, data = df, geom = "point", 
             palette = c("#2E9FDF", "#FC4E07"), 
             ellipse.type = "t",    # Concentration ellipse (can be euclid)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE,             # Avoid label overplotting (slow)
             legend.title = "Clusters",
             stand = FALSE
) +
theme(
    panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain"
) 
##   cluster size ave.sil.width
## 1       1    4          0.35
## 2       2    4          0.54

2.10 Check Whether Clusters Are Valid with R-square and Pseudo-F

# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)

# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised. 
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# Run K-means for Clustering
# NOTE(review): KMEANS() (upper case) is presumably fdm2id's wrapper; here
# k = 4 appears to be the upper bound scanned for the pseudo-F / R-square
# plot — confirm against the fdm2id documentation.
KM2 <- KMEANS(
   df,
   k = 4,
   criterion = "pseudo-F",
   graph = TRUE,
   main = "Number of Clusters vs pseudo-F and R-Square"
) 

2.11 Check Whether Clusters Are Valid with R-square and Pseudo-F (clusterSim Approach)

# Load Library if not available
if(! "clusterSim" %in% installed.packages()) { install.packages("clusterSim", dependencies = TRUE) }
library(clusterSim)

# Perform Clustering
km2 <- kmeans(df, centers = 2, nstart = 25)
km3 <- kmeans(df, centers = 3, nstart = 25)

# Calculate Pseudo-F of 2 Clusters
NumClus <- 2
NumObs  <- nrow(df)

# NOTE(review): pseudoF() is not a base function and clusterSim's exported
# index functions have different names; presumably pseudoF() is defined in a
# chunk not shown here and accepts a kmeans object — confirm its origin.
pseudo_F <- pseudoF(km2)
p_value  <- pf(q = pseudo_F, df1 = NumClus - 1, df2 = NumObs - NumClus, lower.tail = FALSE)
# R-square: share of total variance explained by the clustering
Rsquare  <- (km2$totss - km2$tot.withinss) / km2$totss

cat("\nK-means Clustering Overview for 2 Clusters")
## 
## K-means Clustering Overview for 2 Clusters
cat("\nPseudo-F for 2 Clusters is: ", pseudo_F)
## 
## Pseudo-F for 2 Clusters is:  9.186
cat("\nP-Value for pseudo-F for 2 Clusters is: ", p_value)
## 
## P-Value for pseudo-F for 2 Clusters is:  0.02307
cat("\nR-square for 2 Clusters: ", Rsquare)
## 
## R-square for 2 Clusters:  0.6049
# Calculate Pseudo-F of 3 Clusters
NumClus <- 3
NumObs  <- nrow(df)

pseudo_F <- pseudoF(km3)
p_value  <- pf(q = pseudo_F, df1 = NumClus - 1, df2 = NumObs - NumClus, lower.tail = FALSE)
Rsquare  <- (km3$totss - km3$tot.withinss) / km3$totss

cat("\n\nK-means Clustering Overview for 3 Clusters")
## 
## 
## K-means Clustering Overview for 3 Clusters
cat("\nPseudo-F for 3 Clusters is: ", pseudo_F)
## 
## Pseudo-F for 3 Clusters is:  11.12
cat("\nP-Value for pseudo-F for 3 Clusters is: ", p_value)
## 
## P-Value for pseudo-F for 3 Clusters is:  0.01444
cat("\nR-square for 3 Clusters: ", Rsquare)
## 
## R-square for 3 Clusters:  0.8164
cat("\n")

3 k-Means Clustering for IRIS Data

3.1 Get Data

# Download Training Data from URL

3.2 Explore Data for Iris

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# NOTE(review): `Iris` (standardised) and `IrisRaw` (original units) are not
# created in any chunk shown above — section 3.1 is empty. Confirm the
# data-loading chunk exists in the full document.
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Iris)
## [1] "SepalLength" "SepalWidth"  "PetalLength" "PetalWidth"  "Species"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ SepalLength: num  -0.898 -1.139 -1.381 -1.501 -1.018 ...
##  $ SepalWidth : num  1.0156 -0.1315 0.3273 0.0979 1.245 ...
##  $ PetalLength: num  -1.34 -1.34 -1.39 -1.28 -1.34 ...
##  $ PetalWidth : num  -1.31 -1.31 -1.31 -1.31 -1.31 ...
##  $ Species    : Factor w/ 3 levels "Setosa","Versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Iris, 5)
##   SepalLength SepalWidth PetalLength PetalWidth Species
## 1     -0.8977    1.01560      -1.336     -1.311  Setosa
## 2     -1.1392   -0.13154      -1.336     -1.311  Setosa
## 3     -1.3807    0.32732      -1.392     -1.311  Setosa
## 4     -1.5015    0.09789      -1.279     -1.311  Setosa
## 5     -1.0184    1.24503      -1.336     -1.311  Setosa
cat("\n\nDescriptive Statistics of Columns in OriginalData Frame:\n")
## 
## 
## Descriptive Statistics of Columns in OriginalData Frame:
# Summary statistics of the raw (unscaled) data
st(IrisRaw, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##         Variable   N   Mean     SD Min P25  P50 P75 Max NA Mode
## 1    SepalLength 150  5.843 0.8281 4.3 5.1  5.8 6.4 7.9  0     
## 2     SepalWidth 150  3.057 0.4359   2 2.8    3 3.3 4.4  0     
## 3    PetalLength 150  3.758  1.765   1 1.6 4.35 5.1 6.9  0     
## 4     PetalWidth 150  1.199 0.7622 0.1 0.3  1.3 1.8 2.5  0     
## 5        Species 150                                           
## 6     ... Setosa  50 33.33%                                    
## 7 ... Versicolor  50 33.33%                                    
## 8  ... Virginica  50 33.33%
cat("\n\nDescriptive Statistics of Columns in Scaled Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Scaled Data Frame:
# Same table for the standardised data (means ~0, SD 1 per numeric column)
st(Iris, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##         Variable   N                    Mean SD    Min     P25      P50    P75
## 1    SepalLength 150  -0.0000000000000004484  1 -1.864 -0.8977 -0.05233 0.6722
## 2     SepalWidth 150   0.0000000000000002034  1 -2.426 -0.5904  -0.1315 0.5567
## 3    PetalLength 150 -0.00000000000000002895  1 -1.562  -1.222   0.3354 0.7602
## 4     PetalWidth 150 -0.00000000000000003663  1 -1.442   -1.18   0.1321  0.788
## 5        Species 150                                                          
## 6     ... Setosa  50                  33.33%                                  
## 7 ... Versicolor  50                  33.33%                                  
## 8  ... Virginica  50                  33.33%                                  
##     Max NA Mode
## 1 2.484  0     
## 2  3.08  0     
## 3  1.78  0     
## 4 1.706  0     
## 5              
## 6              
## 7              
## 8

3.3 Plot Data

# Loading other packages if not available
# NOTE(review): graphics is a base package; the install guard is unnecessary.
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)

# Color selection (one colour per Species level)
Colors <- c("orange",      
            "green", 
            "red") 

# Show Characteristics of Data Frame
# BUG FIX: the plot draws SepalLength vs SepalWidth, but the message and the
# axis labels previously said "Petal"; labels corrected to match the data.
# Also removed `caption =`, which is not a plot() graphical parameter and
# only produced warnings.
cat("\n\nShow Sepal Length vs Sepal Width:\n")
## 
## 
## Show Sepal Length vs Sepal Width:
plot(Iris$SepalLength, Iris$SepalWidth, type = "p", 
     main = "Dot Plot of Iris Data",
     sub = "Based on Public Domain Data",
     col = Colors[factor(Iris$Species)], 
     pch = 19,
     xlab = "Sepal Length", 
     ylab = "Sepal Width")
grid(nx = NULL, ny = NULL,
     lty = 2,              # Grid line type
     col = "lightgrey",    # Grid line color
     lwd = 1)              # Grid line width
legend("topleft", fill = Colors, legend = c("Setosa", "Versicolor", "Virginica"), 
     horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")

3.4 Estimating the Optimal Number of Clusters

# Plot Graph of Cluster Number vs SSq (elbow method on the 4 numeric columns)
# BUG FIX: removed `plot.x`, which is not a valid ggplot2 theme element,
# and the inert `col` / `pch` entries in labs() (no colour or shape
# aesthetic is mapped in this plot).
fviz_nbclust(Iris[, c(1:4)], kmeans, method = "wss", linecolor = "red"
    ) +
#  geom_vline(xintercept = 4, size = 1, linetype = 2, color = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
    ) +
labs(title = "Number of Clusters and Respective Sum of Squares",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain",
       x = "Number of Clusters", 
       y = "Total Within Sum of Squares"
     )

3.5 Compute Cluster Indicators

# Load Library if not available
# NOTE(review): stats is a base package; the install guard is unnecessary.
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Calculate Correlations 
# get_dist() is from factoextra; method = "pearson" yields
# correlation-based dissimilarities between observations.
dist.cor <- get_dist(Iris[, c(1:4)], method = "pearson")

# Show a Subset
round(as.matrix(dist.cor)[1:8, 1:8], 1)
##   1 2 3   4 5   6 7 8
## 1 0 0 0 0.0 0 0.0 0 0
## 2 0 0 0 0.0 0 0.0 0 0
## 3 0 0 0 0.0 0 0.0 0 0
## 4 0 0 0 0.0 0 0.1 0 0
## 5 0 0 0 0.0 0 0.0 0 0
## 6 0 0 0 0.1 0 0.0 0 0
## 7 0 0 0 0.0 0 0.0 0 0
## 8 0 0 0 0.0 0 0.0 0 0
# Display Graph
fviz_dist(dist.cor)

3.6 Compute Cluster Details for 5 Clusters

# Calculate Cluster Data 
km5 <- kmeans(Iris[, c(1:4)], centers = 5, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:
print(km5)
## K-means clustering with 5 clusters of sizes 28, 22, 23, 48, 29
## 
## Cluster means:
##   SepalLength SepalWidth PetalLength PetalWidth
## 1     -0.7467     1.4253     -1.2933   -1.21734
## 2     -1.3478     0.1187     -1.3100   -1.29316
## 3     -0.3516    -1.3286      0.1026    0.01228
## 4      0.3804    -0.3896      0.6068    0.56391
## 5      1.3927     0.2324      1.1567    1.21328
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   2   2   2   1   1   2   1   2   2   1   2   2   2   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   2   2   2   1   1   1   2   2   1   1   1   2   2   1   1   2   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   2   2   1   1   2   1   2   1   2   5   4   5   3   4   4   4   3   4   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   4   3   4   3   4   4   3   3   3   4   4   4   4   4   4   4   4   4   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   3   3   3   4   4   4   4   3   4   3   3   4   3   3   3   4   4   4   3   3 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   5   4   5   4   5   5   3   5   4   5   5   4   5   4   4   5   4   5   5   3 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   5   4   5   4   5   5   4   4   4   5   5   5   4   4   4   5   5   4   4   5 
## 141 142 143 144 145 146 147 148 149 150 
##   5   5   4   5   5   5   4   4   5   4 
## 
## Within cluster sum of squares by cluster:
## [1] 13.762  8.033 13.687 27.830 26.891
##  (between_SS / total_SS =  84.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show members of Clusters
# Create a data frame with the original data and the cluster assignments
clustered_data <- data.frame(IrisRaw[,1:4], Cluster = km5$cluster)

# Print members of each cluster
for (i in 1:max(clustered_data$Cluster)) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
## 
## Cluster 1 :
##    SepalLength SepalWidth PetalLength PetalWidth Cluster
## 1          5.1        3.5         1.4        0.2       1
## 5          5.0        3.6         1.4        0.2       1
## 6          5.4        3.9         1.7        0.4       1
## 8          5.0        3.4         1.5        0.2       1
## 11         5.4        3.7         1.5        0.2       1
## 15         5.8        4.0         1.2        0.2       1
## 16         5.7        4.4         1.5        0.4       1
## 17         5.4        3.9         1.3        0.4       1
## 18         5.1        3.5         1.4        0.3       1
## 19         5.7        3.8         1.7        0.3       1
## 20         5.1        3.8         1.5        0.3       1
## 21         5.4        3.4         1.7        0.2       1
## 22         5.1        3.7         1.5        0.4       1
## 23         4.6        3.6         1.0        0.2       1
## 27         5.0        3.4         1.6        0.4       1
## 28         5.2        3.5         1.5        0.2       1
## 29         5.2        3.4         1.4        0.2       1
## 32         5.4        3.4         1.5        0.4       1
## 33         5.2        4.1         1.5        0.1       1
## 34         5.5        4.2         1.4        0.2       1
## 37         5.5        3.5         1.3        0.2       1
## 38         4.9        3.6         1.4        0.1       1
## 40         5.1        3.4         1.5        0.2       1
## 41         5.0        3.5         1.3        0.3       1
## 44         5.0        3.5         1.6        0.6       1
## 45         5.1        3.8         1.9        0.4       1
## 47         5.1        3.8         1.6        0.2       1
## 49         5.3        3.7         1.5        0.2       1
## 
## 
## Cluster 2 :
##    SepalLength SepalWidth PetalLength PetalWidth Cluster
## 2          4.9        3.0         1.4        0.2       2
## 3          4.7        3.2         1.3        0.2       2
## 4          4.6        3.1         1.5        0.2       2
## 7          4.6        3.4         1.4        0.3       2
## 9          4.4        2.9         1.4        0.2       2
## 10         4.9        3.1         1.5        0.1       2
## 12         4.8        3.4         1.6        0.2       2
## 13         4.8        3.0         1.4        0.1       2
## 14         4.3        3.0         1.1        0.1       2
## 24         5.1        3.3         1.7        0.5       2
## 25         4.8        3.4         1.9        0.2       2
## 26         5.0        3.0         1.6        0.2       2
## 30         4.7        3.2         1.6        0.2       2
## 31         4.8        3.1         1.6        0.2       2
## 35         4.9        3.1         1.5        0.2       2
## 36         5.0        3.2         1.2        0.2       2
## 39         4.4        3.0         1.3        0.2       2
## 42         4.5        2.3         1.3        0.3       2
## 43         4.4        3.2         1.3        0.2       2
## 46         4.8        3.0         1.4        0.3       2
## 48         4.6        3.2         1.4        0.2       2
## 50         5.0        3.3         1.4        0.2       2
## 
## 
## Cluster 3 :
##     SepalLength SepalWidth PetalLength PetalWidth Cluster
## 54          5.5        2.3         4.0        1.3       3
## 58          4.9        2.4         3.3        1.0       3
## 60          5.2        2.7         3.9        1.4       3
## 61          5.0        2.0         3.5        1.0       3
## 63          6.0        2.2         4.0        1.0       3
## 65          5.6        2.9         3.6        1.3       3
## 68          5.8        2.7         4.1        1.0       3
## 69          6.2        2.2         4.5        1.5       3
## 70          5.6        2.5         3.9        1.1       3
## 80          5.7        2.6         3.5        1.0       3
## 81          5.5        2.4         3.8        1.1       3
## 82          5.5        2.4         3.7        1.0       3
## 83          5.8        2.7         3.9        1.2       3
## 88          6.3        2.3         4.4        1.3       3
## 90          5.5        2.5         4.0        1.3       3
## 91          5.5        2.6         4.4        1.2       3
## 93          5.8        2.6         4.0        1.2       3
## 94          5.0        2.3         3.3        1.0       3
## 95          5.6        2.7         4.2        1.3       3
## 99          5.1        2.5         3.0        1.1       3
## 100         5.7        2.8         4.1        1.3       3
## 107         4.9        2.5         4.5        1.7       3
## 120         6.0        2.2         5.0        1.5       3
## 
## 
## Cluster 4 :
##     SepalLength SepalWidth PetalLength PetalWidth Cluster
## 52          6.4        3.2         4.5        1.5       4
## 55          6.5        2.8         4.6        1.5       4
## 56          5.7        2.8         4.5        1.3       4
## 57          6.3        3.3         4.7        1.6       4
## 59          6.6        2.9         4.6        1.3       4
## 62          5.9        3.0         4.2        1.5       4
## 64          6.1        2.9         4.7        1.4       4
## 66          6.7        3.1         4.4        1.4       4
## 67          5.6        3.0         4.5        1.5       4
## 71          5.9        3.2         4.8        1.8       4
## 72          6.1        2.8         4.0        1.3       4
## 73          6.3        2.5         4.9        1.5       4
## 74          6.1        2.8         4.7        1.2       4
## 75          6.4        2.9         4.3        1.3       4
## 76          6.6        3.0         4.4        1.4       4
## 77          6.8        2.8         4.8        1.4       4
## 78          6.7        3.0         5.0        1.7       4
## 79          6.0        2.9         4.5        1.5       4
## 84          6.0        2.7         5.1        1.6       4
## 85          5.4        3.0         4.5        1.5       4
## 86          6.0        3.4         4.5        1.6       4
## 87          6.7        3.1         4.7        1.5       4
## 89          5.6        3.0         4.1        1.3       4
## 92          6.1        3.0         4.6        1.4       4
## 96          5.7        3.0         4.2        1.2       4
## 97          5.7        2.9         4.2        1.3       4
## 98          6.2        2.9         4.3        1.3       4
## 102         5.8        2.7         5.1        1.9       4
## 104         6.3        2.9         5.6        1.8       4
## 109         6.7        2.5         5.8        1.8       4
## 112         6.4        2.7         5.3        1.9       4
## 114         5.7        2.5         5.0        2.0       4
## 115         5.8        2.8         5.1        2.4       4
## 117         6.5        3.0         5.5        1.8       4
## 122         5.6        2.8         4.9        2.0       4
## 124         6.3        2.7         4.9        1.8       4
## 127         6.2        2.8         4.8        1.8       4
## 128         6.1        3.0         4.9        1.8       4
## 129         6.4        2.8         5.6        2.1       4
## 133         6.4        2.8         5.6        2.2       4
## 134         6.3        2.8         5.1        1.5       4
## 135         6.1        2.6         5.6        1.4       4
## 138         6.4        3.1         5.5        1.8       4
## 139         6.0        3.0         4.8        1.8       4
## 143         5.8        2.7         5.1        1.9       4
## 147         6.3        2.5         5.0        1.9       4
## 148         6.5        3.0         5.2        2.0       4
## 150         5.9        3.0         5.1        1.8       4
## 
## 
## Cluster 5 :
##     SepalLength SepalWidth PetalLength PetalWidth Cluster
## 51          7.0        3.2         4.7        1.4       5
## 53          6.9        3.1         4.9        1.5       5
## 101         6.3        3.3         6.0        2.5       5
## 103         7.1        3.0         5.9        2.1       5
## 105         6.5        3.0         5.8        2.2       5
## 106         7.6        3.0         6.6        2.1       5
## 108         7.3        2.9         6.3        1.8       5
## 110         7.2        3.6         6.1        2.5       5
## 111         6.5        3.2         5.1        2.0       5
## 113         6.8        3.0         5.5        2.1       5
## 116         6.4        3.2         5.3        2.3       5
## 118         7.7        3.8         6.7        2.2       5
## 119         7.7        2.6         6.9        2.3       5
## 121         6.9        3.2         5.7        2.3       5
## 123         7.7        2.8         6.7        2.0       5
## 125         6.7        3.3         5.7        2.1       5
## 126         7.2        3.2         6.0        1.8       5
## 130         7.2        3.0         5.8        1.6       5
## 131         7.4        2.8         6.1        1.9       5
## 132         7.9        3.8         6.4        2.0       5
## 136         7.7        3.0         6.1        2.3       5
## 137         6.3        3.4         5.6        2.4       5
## 140         6.9        3.1         5.4        2.1       5
## 141         6.7        3.1         5.6        2.4       5
## 142         6.9        3.1         5.1        2.3       5
## 144         6.8        3.2         5.9        2.3       5
## 145         6.7        3.3         5.7        2.5       5
## 146         6.7        3.0         5.2        2.3       5
## 149         6.2        3.4         5.4        2.3       5
# Per-cluster means of the four measurements, computed on the original
# (unscaled) data so centroids are interpretable in the raw units.
# NOTE(review): assumes km5 was fit on the same row order as IrisRaw — confirm.
aggregate(IrisRaw[,1:4], by = list(cluster = km5$cluster), mean)
##   cluster SepalLength SepalWidth PetalLength PetalWidth
## 1       1       5.225      3.679       1.475     0.2714
## 2       2       4.727      3.109       1.445     0.2136
## 3       3       5.552      2.478       3.939     1.2087
## 4       4       6.158      2.888       4.829     1.6292
## 5       5       6.997      3.159       5.800     2.1241

3.7 Determining Optimal Number of Clusters

3.7.1 Defining Number of Clusters with Sum of Squares

# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)

# Elbow method: plot total within-cluster sum of squares for k = 1..10 and
# look for the bend where adding clusters stops paying off.
fviz_nbclust(Iris[, 1:4], kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +   # candidate elbow at k = 4
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10)
        # NOTE(review): dropped `plot.x` — not a valid ggplot2 theme element.
        ) +
  labs(title = "Elbow Method",
       subtitle = "Based on IRIS Data",
       caption = "Data source: Public Domain",
       # NOTE(review): dropped `col =` / `pch =` — labs() sets axis/legend
       # labels only; colour vectors and point symbols do not belong here.
       x = "Number of Clusters",
       y = "Total Within Sum of Squares")

3.7.2 Defining Number of Clusters with Silhouette Analysis

# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)

# Silhouette method: average silhouette width per k; the maximum suggests
# the best-separated clustering.
fviz_nbclust(Iris[, 1:4], kmeans, method = "silhouette") +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10)
        # NOTE(review): dropped `plot.x` — not a valid ggplot2 theme element.
        ) +
  labs(title = "Silhouette Method",
       subtitle = "Silhouette method - IRIS Data",
       caption = "Data source: Public Domain",
       # NOTE(review): dropped `col =` / `pch =` — labs() sets labels only.
       x = "Number of Clusters",
       y = "Average Silhouette Width")

3.7.3 Defining Number of Clusters with Gap Statistic

# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)

# Gap statistic: compare within-cluster dispersion against a bootstrapped
# null reference (nboot = 50 keeps runtime moderate); seed fixed for
# reproducible bootstrap samples.
set.seed(123)
fviz_nbclust(Iris[, 1:4], kmeans, nstart = 25, method = "gap_stat", nboot = 50) +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10)
        # NOTE(review): dropped `plot.x` — not a valid ggplot2 theme element.
        ) +
  labs(title = "Gap Statistic Method",
       # Fixed copy-paste: subtitle previously said "Silhouette method".
       subtitle = "Gap statistic method - IRIS Data",
       caption = "Data source: Public Domain",
       x = "Number of Clusters",
       y = "Gap Statistic (k)")

3.7.4 Determining the Optimal Number of Clusters with NbClust

# NbClust: evaluate ~30 cluster-validity indices for k in 2..10
# (k-means partitioning, Euclidean distance) and report the majority vote.
nb <- NbClust(Iris[, 1:4], distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

3.7.5 Determining the Optimal Number of Clusters out of 30 Indicators

# Visualize the NbClust vote: bar chart of how many indices propose each k
# (uses the patched fviz_nbclust defined at the top of this file).
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 11 proposed  2 as the best number of clusters
## * 10 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 1 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

3.8 Build Clusters

# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)

# K-means clustering with k = 2, 3, 4 (25 random starts each to avoid
# poor local optima); graph = FALSE so plotting is done explicitly below.
km2 <- eclust(Iris[, 1:4], "kmeans", k = 2, nstart = 25, graph = FALSE)
km3 <- eclust(Iris[, 1:4], "kmeans", k = 3, nstart = 25, graph = FALSE)
km4 <- eclust(Iris[, 1:4], "kmeans", k = 4, nstart = 25, graph = FALSE)

# Shared styling for all three cluster plots.
# NOTE(review): previously repeated per plot, with an invalid `plot.x`
# theme element and trailing commas leaving empty arguments in the calls.
cluster_theme <- theme(
  panel.grid.major = element_line(color = "grey", size = 0.5),
  panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
  plot.title = element_text(hjust = 0.5, size = 20),
  plot.subtitle = element_text(hjust = 0.5, size = 14),
  plot.caption = element_text(hjust = 0, face = "italic", size = 10)
)
cluster_labs <- labs(title = "Clusters Plot of Iris Data",
                     subtitle = "Based on Public Domain Data",
                     caption = "Data source: Public Domain")

# Visualize k-means clusters
fviz_cluster(km2, data = Iris, geom = "point",
             palette = c("#2E9FDF", "#FC4E07"),
             ellipse.type = "norm",    # Concentration ellipse (can be euclid)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) + cluster_theme + cluster_labs

fviz_cluster(km3, data = Iris, geom = "point",
             palette = c("#2E9FDF", "#FC4E07", "green"),
             ellipse.type = "norm",    # Concentration ellipse (can be euclid)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) + cluster_theme + cluster_labs

fviz_cluster(km4, data = Iris, geom = "point",
             palette = c("#2E9FDF", "#FC4E07", "green", "purple"),
             ellipse.type = "norm",    # Concentration ellipse (can be euclid)
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) + cluster_theme + cluster_labs

# Hierarchical clustering (Ward linkage on Euclidean distances).
# NOTE(review): restricted to the 4 numeric columns — passing the full
# data frame would coerce the Species factor to integer codes inside
# dist(), silently clustering on the class label.
hc.res <- eclust(Iris[, 1:4], "hclust", k = 3, hc_metric = "euclidean", hc_method = "ward.D2", graph = FALSE)
hc.res
## 
## Call:
## stats::hclust(d = x, method = hc_method)
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 150
# Visualize dendrograms
# fviz_dend(hc.res, show_labels = FALSE, palette = "jco", as.ggplot = TRUE)

3.9 Cluster Validation with Silhouette

# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)

# Silhouette plot of the k = 3 solution: one bar per observation, grouped
# by cluster; negative widths indicate likely misassignments.
fviz_silhouette(km3, palette = "jco", ggtheme = theme_classic()) +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    #   panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10),
        # NOTE(review): dropped `plot.x` — not a valid ggplot2 theme element;
        # trailing comma in theme() also removed.
        axis.text.x = element_text(color = "black", size = 12)
        ) +
  scale_x_discrete(breaks = seq(10, 150, by = 20)) +
  labs(title = "Clusters Silhouette Plot of Iris Data",
       subtitle = "Based on Iris Data",
       caption = "Data source: Public Domain")
##   cluster size ave.sil.width
## 1       1   50          0.64
## 2       2   53          0.39
## 3       3   47          0.35

# Silhouette information attached to the eclust result (widths per
# observation, per-cluster averages, and the overall average)
silinfo <- km3$silinfo
names(silinfo)
## [1] "widths"          "clus.avg.widths" "avg.width"
# Silhouette widths of each observation (cluster, nearest neighbor, width)
head(silinfo$widths[, 1:3], 10)
##    cluster neighbor sil_width
## 1        1        2    0.7342
## 41       1        2    0.7333
## 8        1        2    0.7308
## 18       1        2    0.7288
## 5        1        2    0.7285
## 40       1        2    0.7247
## 38       1        2    0.7244
## 12       1        2    0.7218
## 28       1        2    0.7215
## 29       1        2    0.7145
# Average silhouette width of each cluster
silinfo$clus.avg.widths
## [1] 0.6363 0.3934 0.3474
# The total average (mean of all individual silhouette widths)
silinfo$avg.width
## [1] 0.4599
# The size of each cluster
km3$size
## [1] 50 53 47
# Silhouette width of each observation, kept for the negative-width scan below
sil <- km3$silinfo$widths[, 1:3]
cat("\n\nShow Silhouette With per Data Point. The higher the better:\n")
## 
## 
## Show Silhouette With per Data Point. The higher the better:
sil
##     cluster neighbor sil_width
## 1         1        2   0.73419
## 41        1        2   0.73333
## 8         1        2   0.73082
## 18        1        2   0.72875
## 5         1        2   0.72847
## 40        1        2   0.72470
## 38        1        2   0.72442
## 12        1        2   0.72179
## 28        1        2   0.72151
## 29        1        2   0.71452
## 50        1        2   0.71077
## 27        1        2   0.70471
## 25        1        2   0.70132
## 7         1        2   0.69838
## 23        1        2   0.69653
## 22        1        2   0.69021
## 49        1        2   0.68912
## 47        1        2   0.67882
## 3         1        2   0.67755
## 20        1        2   0.67746
## 36        1        2   0.67615
## 11        1        2   0.67418
## 30        1        2   0.66776
## 48        1        2   0.66606
## 37        1        2   0.66543
## 44        1        2   0.66372
## 21        1        2   0.66043
## 45        1        2   0.64864
## 32        1        2   0.64783
## 24        1        2   0.63738
## 43        1        2   0.63726
## 10        1        2   0.63154
## 35        1        2   0.62844
## 31        1        2   0.62562
## 17        1        2   0.62112
## 4         1        2   0.62050
## 6         1        2   0.60988
## 33        1        2   0.58967
## 19        1        2   0.58590
## 13        1        2   0.57847
## 2         1        2   0.56827
## 46        1        2   0.55935
## 15        1        2   0.55295
## 39        1        2   0.55262
## 14        1        2   0.54945
## 26        1        2   0.54495
## 34        1        2   0.54088
## 9         1        2   0.48821
## 16        1        2   0.45807
## 42        1        2   0.07767
## 90        2        3   0.58630
## 91        2        3   0.58417
## 95        2        3   0.58398
## 70        2        3   0.58259
## 93        2        3   0.58008
## 83        2        3   0.56859
## 81        2        3   0.56668
## 82        2        3   0.55698
## 80        2        3   0.55591
## 100       2        3   0.55292
## 68        2        3   0.54918
## 60        2        3   0.54390
## 54        2        3   0.54374
## 56        2        3   0.53879
## 97        2        3   0.50035
## 65        2        3   0.49518
## 107       2        3   0.47008
## 63        2        3   0.46947
## 72        2        3   0.45853
## 89        2        3   0.44045
## 94        2        1   0.43897
## 99        2        1   0.43311
## 96        2        3   0.43068
## 74        2        3   0.42610
## 61        2        1   0.42467
## 88        2        3   0.41830
## 120       2        3   0.41626
## 58        2        1   0.40978
## 85        2        3   0.40498
## 69        2        3   0.40351
## 67        2        3   0.39250
## 84        2        3   0.39024
## 114       2        3   0.37967
## 79        2        3   0.37448
## 73        2        3   0.36639
## 135       2        3   0.34505
## 102       2        3   0.34345
## 143       2        3   0.34345
## 62        2        3   0.34089
## 98        2        3   0.33949
## 64        2        3   0.33188
## 122       2        3   0.31781
## 92        2        3   0.24126
## 147       2        3   0.22992
## 75        2        3   0.21912
## 134       2        3   0.19544
## 124       2        3   0.17745
## 127       2        3   0.17430
## 55        2        3   0.13271
## 150       2        3   0.09788
## 139       2        3   0.08594
## 115       2        3   0.05779
## 59        2        3   0.03769
## 121       3        2   0.54976
## 144       3        2   0.54298
## 140       3        2   0.53818
## 125       3        2   0.52569
## 103       3        2   0.52208
## 126       3        2   0.51434
## 142       3        2   0.51143
## 141       3        2   0.50862
## 145       3        2   0.50445
## 113       3        2   0.50147
## 106       3        2   0.47219
## 136       3        2   0.46885
## 110       3        2   0.46638
## 146       3        2   0.46228
## 111       3        2   0.44616
## 116       3        2   0.44552
## 108       3        2   0.44361
## 105       3        2   0.43894
## 130       3        2   0.43638
## 101       3        2   0.42464
## 137       3        2   0.42243
## 131       3        2   0.41286
## 118       3        2   0.41132
## 123       3        2   0.40084
## 132       3        2   0.39874
## 149       3        2   0.38714
## 148       3        2   0.38121
## 138       3        2   0.36094
## 53        3        2   0.35855
## 117       3        2   0.35179
## 78        3        2   0.34222
## 51        3        2   0.34170
## 119       3        2   0.33087
## 87        3        2   0.27954
## 133       3        2   0.24284
## 57        3        2   0.23207
## 129       3        2   0.22254
## 66        3        2   0.18907
## 52        3        2   0.16628
## 104       3        2   0.15133
## 86        3        2   0.10973
## 76        3        2   0.05112
## 77        3        2   0.04064
## 71        3        2   0.03645
## 109       3        2   0.01676
## 112       3        2  -0.01058
## 128       3        2  -0.02489
# Objects with negative silhouette width: closer to a neighboring cluster
# than to their own (likely misassigned points)
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
##     cluster neighbor sil_width
## 112       3        2  -0.01058
## 128       3        2  -0.02489
# Statistics for k-means clustering.
# NOTE(review): distances restricted to the 4 numeric columns — dist(Iris)
# would coerce the Species factor to integer codes and fold the class label
# into the distances, which km3 (fit on columns 1:4) never saw.
km_stats <- cluster.stats(dist(Iris[, 1:4]), km3$cluster)

# Print a legend for the cluster.stats() fields, then the statistics object
cat("\n\nShow Overview of Cluster Characteristics:\n")
## 
## 
## Show Overview of Cluster Characteristics:
cat("- Cluster Sizes ($cluster.size): The number of data points in each cluster.\n")
## - Cluster Sizes ($cluster.size): The number of data points in each cluster.
cat("- Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.\n")
## - Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.
cat("- Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.\n")
## - Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.
cat("- Average Distances Between Clusters ($average.between): The average distance between points in different clusters.\n")
## - Average Distances Between Clusters ($average.between): The average distance between points in different clusters.
cat("- Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.\n")
## - Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.
cat("- Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.\n")
## - Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.
cat("- Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.\n")
## - Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.
cat("- Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.\n")
## - Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.
cat("- Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.\n\n")
## - Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
km_stats
## $n
## [1] 150
## 
## $cluster.number
## [1] 3
## 
## $cluster.size
## [1] 50 53 47
## 
## $min.cluster.size
## [1] 47
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 5.628 3.267 3.738
## 
## $average.distance
## [1] 1.314 1.338 1.462
## 
## $median.distance
## [1] 1.105 1.292 1.385
## 
## $separation
## [1] 1.7367 0.1491 0.1491
## 
## $average.toother
## [1] 4.078 2.990 3.445
## 
## $separation.matrix
##       [,1]   [,2]   [,3]
## [1,] 0.000 1.7367 2.7001
## [2,] 1.737 0.0000 0.1491
## [3,] 2.700 0.1491 0.0000
## 
## $ave.between.matrix
##       [,1]  [,2]  [,3]
## [1,] 0.000 3.601 4.617
## [2,] 3.601 0.000 2.340
## [3,] 4.617 2.340 0.000
## 
## $average.between
## [1] 3.5
## 
## $average.within
## [1] 1.369
## 
## $n.between
## [1] 7491
## 
## $n.within
## [1] 3684
## 
## $max.diameter
## [1] 5.628
## 
## $min.separation
## [1] 0.1491
## 
## $within.cluster.ss
## [1] 173.6
## 
## $clus.avg.silwidths
##      1      2      3 
## 0.6363 0.3934 0.3474 
## 
## $avg.silwidth
## [1] 0.4599
## 
## $g2
## NULL
## 
## $g3
## NULL
## 
## $pearsongamma
## [1] 0.6797
## 
## $dunn
## [1] 0.0265
## 
## $dunn2
## [1] 1.6
## 
## $entropy
## [1] 1.097
## 
## $wb.ratio
## [1] 0.3911
## 
## $ch
## [1] 241.9
## 
## $cwidegap
## [1] 1.5532 0.8748 1.0546
## 
## $widestgap
## [1] 1.553
## 
## $sindex
## [1] 0.3941
## 
## $corrected.rand
## NULL
## 
## $vi
## NULL
# Cross-tabulate true species against the k = 3 k-means assignment to see
# how well clusters recover the known classes
cat("\n\nShow Cluster-Category-Membership:\n")
## 
## 
## Show Cluster-Category-Membership:
table(Iris$Species, km3$cluster)
##             
##               1  2  3
##   Setosa     50  0  0
##   Versicolor  0 39 11
##   Virginica   0 14 36

3.10 Check Whether Clusters Are Valid Using R-Square and Pseudo-F

# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)

# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised. 
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# k is left commented out so KMEANS selects it via the pseudo-F criterion;
# graph = TRUE draws criterion vs. number of clusters.
# NOTE(review): presumably fdm2id::KMEANS — confirm, name clashes are possible.
KM2 <- KMEANS(
   Iris[, c(1:4)],
#  k = 2,
   criterion = "pseudo-F",
   graph = TRUE,
   nstart = 10,
   main = "Number of Clusters vs pseudo-F and R-Square"
) 

# Calculate Pseudo-F of Clusters
#cat("\nK-means Clustering Overview:\n\n")
#pseudoF(KM2)

3.11 Check Agreement between Species and PAM/HC Clusters

# Load Library if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Agreement between species and pam clusters
# Partition the four numeric iris measurements with PAM (k-medoids).
# FIX: use k = 3 to match the three species levels — the original used k = 4,
# which cannot line up with the species labels this section claims to check.
pam.res <- eclust(Iris[, c(1:4)], "pam", k = 3, graph = FALSE)

# Cross-tabulate species (rows) against the PAM cluster assignment (columns).
# FIX: the original tabulated km3$cluster here, so pam.res was never used.
table(Iris$Species, pam.res$cluster)

# Optional: variation-of-information between the species labels and the PAM
# clustering. FIX: the original commented call had the parentheses misplaced —
# cluster.stats() takes the distance matrix plus two integer clusterings.
# cluster.stats(d = dist(Iris[, c(1:4)]), as.integer(Iris$Species), pam.res$cluster)$vi

4 K-Means Clustering for US Arrest Data

4.1 Get Data

# Download Training Data from URL

4.2 Explore Data for USarrests

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Show Characteristics of Data Frame
# Quick audit of USarrests before cleaning/scaling: column names, structure,
# first rows, and a descriptive-statistics table.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(USarrests)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(USarrests)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(USarrests, 5)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# vtable::st() summary table; the first summ vector lists statistics for
# numeric columns, the second gives count/percent for factor columns.
st(USarrests, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##   Variable  N  Mean    SD Min   P25  P50   P75  Max NA Mode
## 1   Murder 50 7.788 4.356 0.8 4.075 7.25 11.25 17.4  0     
## 2  Assault 50 170.8 83.34  45   109  159   249  337  0     
## 3 UrbanPop 50 65.54 14.47  32  54.5   66 77.75   91  0     
## 4     Rape 50 21.23 9.366 7.3 15.08 20.1 26.17   46  0

4.3 Clean and Scale Data

# Omitting any NA values
USarrests <- na.omit(USarrests)

# Scale Data
# Standardise every numeric column (mean 0, sd 1) so that no single variable
# dominates the Euclidean distances used by k-means.
numeric_cols <- sapply(USarrests, is.numeric)
USarrests[, numeric_cols] <- scale(USarrests[, numeric_cols])

# Write Data to WD
write.csv(USarrests, file = "USarrestsScaled.csv")

4.4 Estimating the Optimal Number of Clusters

# Plot Graph of Cluster Number vs SSq (elbow method)
# FIX: the original theme() call ended with a trailing comma (an empty
# argument, which R rejects at evaluation time) and used `plot.x`, which is
# not a valid ggplot2 theme element; the commented geom_vline() also used the
# invalid argument name `linecolor` (corrected to `color`).
fviz_nbclust(USarrests, kmeans, method = "wss", linecolor = "red") +
#  geom_vline(xintercept = 4, size = 1, linetype = 2, color = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Number of Clusters and Respective Sum of Squares",
       subtitle = "Based on Crime Data from USA",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "Number of Clusters", 
       y = "Total Within Sum of Squares")

4.5 Compute Cluster Indicators

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Calculate Correlations 
# factoextra::get_dist() builds a Pearson-correlation-based distance matrix
# between observations (states), not between variables.
dist.cor <- get_dist(USarrests, method = "pearson")

# Show a Subset
# Preview the first 8x8 corner of the distance matrix, rounded to 1 decimal.
round(as.matrix(dist.cor)[1:8, 1:8], 1)
##             Alabama Alaska Arizona Arkansas California Colorado Connecticut
## Alabama         0.0    0.7     1.4      0.1        1.9      1.7         1.7
## Alaska          0.7    0.0     0.8      0.4        0.8      0.5         1.9
## Arizona         1.4    0.8     0.0      1.2        0.3      0.6         0.8
## Arkansas        0.1    0.4     1.2      0.0        1.6      1.4         1.9
## California      1.9    0.8     0.3      1.6        0.0      0.1         0.7
## Colorado        1.7    0.5     0.6      1.4        0.1      0.0         1.0
## Connecticut     1.7    1.9     0.8      1.9        0.7      1.0         0.0
## Delaware        1.1    1.5     0.3      1.2        0.9      1.4         0.5
##             Delaware
## Alabama          1.1
## Alaska           1.5
## Arizona          0.3
## Arkansas         1.2
## California       0.9
## Colorado         1.4
## Connecticut      0.5
## Delaware         0.0
# Display Graph
fviz_dist(dist.cor)

4.6 Compute Cluster Details for 4 Clusters

# Calculate Cluster Data 
# k-means with 4 centres; nstart = 25 repeats the algorithm from 25 random
# initialisations and keeps the best solution (guards against local optima).
km4 <- kmeans(USarrests, centers = 4, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:
print(km4)
## K-means clustering with 4 clusters of sizes 16, 8, 13, 13
## 
## Cluster means:
##    Murder Assault UrbanPop     Rape
## 1 -0.4894 -0.3826   0.5758 -0.26165
## 2  1.4119  0.8743  -0.8145  0.01927
## 3 -0.9615 -1.1066  -0.9301 -0.96676
## 4  0.6951  1.0394   0.7226  1.27694
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              4              4              2              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              1              1              4              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              3              4              1              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              3              2              3              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              4              3              2              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              4              3              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              2              3              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              2              4              1              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              3              3              1 
## 
## Within cluster sum of squares by cluster:
## [1] 16.212  8.316 11.952 19.922
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show cluster membership: attach each state's cluster id to the raw
# (unscaled) data, then print the member rows of every cluster in turn.
clustered_data <- data.frame(USarrestsRaw, Cluster = km4$cluster)

# Walk over the cluster ids and list the states belonging to each one
for (cl in seq_len(max(clustered_data$Cluster))) {
  cat("\nCluster", cl, ":\n")
  print(clustered_data[clustered_data$Cluster == cl, ])
  cat("\n")
}
## 
## Cluster 1 :
##               Murder Assault UrbanPop Rape Cluster
## Connecticut      3.3     110       77 11.1       1
## Delaware         5.9     238       72 15.8       1
## Hawaii           5.3      46       83 20.2       1
## Indiana          7.2     113       65 21.0       1
## Kansas           6.0     115       66 18.0       1
## Massachusetts    4.4     149       85 16.3       1
## New Jersey       7.4     159       89 18.8       1
## Ohio             7.3     120       75 21.4       1
## Oklahoma         6.6     151       68 20.0       1
## Oregon           4.9     159       67 29.3       1
## Pennsylvania     6.3     106       72 14.9       1
## Rhode Island     3.4     174       87  8.3       1
## Utah             3.2     120       80 22.9       1
## Virginia         8.5     156       63 20.7       1
## Washington       4.0     145       73 26.2       1
## Wyoming          6.8     161       60 15.6       1
## 
## 
## Cluster 2 :
##                Murder Assault UrbanPop Rape Cluster
## Alabama          13.2     236       58 21.2       2
## Arkansas          8.8     190       50 19.5       2
## Georgia          17.4     211       60 25.8       2
## Louisiana        15.4     249       66 22.2       2
## Mississippi      16.1     259       44 17.1       2
## North Carolina   13.0     337       45 16.1       2
## South Carolina   14.4     279       48 22.5       2
## Tennessee        13.2     188       59 26.9       2
## 
## 
## Cluster 3 :
##               Murder Assault UrbanPop Rape Cluster
## Idaho            2.6     120       54 14.2       3
## Iowa             2.2      56       57 11.3       3
## Kentucky         9.7     109       52 16.3       3
## Maine            2.1      83       51  7.8       3
## Minnesota        2.7      72       66 14.9       3
## Montana          6.0     109       53 16.4       3
## Nebraska         4.3     102       62 16.5       3
## New Hampshire    2.1      57       56  9.5       3
## North Dakota     0.8      45       44  7.3       3
## South Dakota     3.8      86       45 12.8       3
## Vermont          2.2      48       32 11.2       3
## West Virginia    5.7      81       39  9.3       3
## Wisconsin        2.6      53       66 10.8       3
## 
## 
## Cluster 4 :
##            Murder Assault UrbanPop Rape Cluster
## Alaska       10.0     263       48 44.5       4
## Arizona       8.1     294       80 31.0       4
## California    9.0     276       91 40.6       4
## Colorado      7.9     204       78 38.7       4
## Florida      15.4     335       80 31.9       4
## Illinois     10.4     249       83 24.0       4
## Maryland     11.3     300       67 27.8       4
## Michigan     12.1     255       74 35.1       4
## Missouri      9.0     178       70 28.2       4
## Nevada       12.2     252       81 46.0       4
## New Mexico   11.4     285       70 32.1       4
## New York     11.1     254       86 26.1       4
## Texas        12.7     201       80 25.5       4
# List Means with Original Data
# Per-cluster means on the raw (unscaled) variables, so the cluster profiles
# are readable in the original units.
aggregate(USarrestsRaw, by = list(cluster = km4$cluster), mean)
##   cluster Murder Assault UrbanPop  Rape
## 1       1  5.656  138.88    73.88 18.78
## 2       2 13.938  243.62    53.75 21.41
## 3       3  3.600   78.54    52.08 12.18
## 4       4 10.815  257.38    76.00 33.19

4.7 Perform k-Means Clustering

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# output to be present as PNG file 
# png(file = "KMeansCluster.png")
 
# Generate k-Means Cluster
# Fit and plot solutions for k = 4, 5, 6 so the cluster shapes can be
# compared visually; nstart = 25 restarts guard against poor local optima.
km4 <- kmeans(USarrests, centers = 4, nstart = 25)
 
# Visualize the clusters
fviz_cluster(km4, data = USarrests)

# saving the file 
# dev.off()
 
# output to be present as PNG file 
# png(file = "KMeansCluster2.png")
 
# Generate k-Means Cluster
km5 <- kmeans(USarrests, centers = 5, nstart = 25)
 
# Visualize the clusters
fviz_cluster(km5, data = USarrests)

# saving the file 
# dev.off()

# output to be present as PNG file 
# png(file = "KMeansCluster3.png")
 
# Generate k-Means Cluster
km6 <- kmeans(USarrests, centers = 6, nstart = 25)
 
# Visualize the clusters
fviz_cluster(km6, data = USarrests)

# Visualize the km4 clusters with custom palette, concentration ellipses and
# centroid star segments.
# FIX: the original fviz_cluster() and theme() calls both ended with a
# trailing comma (an empty argument, which R rejects), and `plot.x` is not a
# valid ggplot2 theme element.
fviz_cluster(km4, data = USarrests, 
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"), 
             ellipse.type = "euclid",  # Concentration ellipse
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Clusters Plot of Crime Data",
       subtitle = "Based on Crime Data from USA",
       caption = "Data source: Public Domain")

# saving the file 
# dev.off()

4.8 Check Whether Clusters Are Valid

# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)

# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised. 
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# NOTE(review): here both k = 8 and a criterion are supplied; presumably
# KMEANS evaluates candidate cluster counts up to k and keeps the pseudo-F
# maximiser — confirm against the fdm2id documentation.
KM4 <- KMEANS(
  USarrests,
  k = 8,
  criterion = "pseudo-F",
  graph = TRUE,
  nstart = 10
)

# Calculate Pseudo-F of Clusters
pseudoF(KM4)
## [1] 43.46
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:

4.9 Cluster Validation with Silhouette

# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)

# Generate k-Means Cluster via factoextra::eclust().
# FIX: a plain kmeans() object has no $silinfo component, so every silhouette
# statistic below printed NULL in the original run. eclust() wraps kmeans()
# and attaches the silhouette information ($silinfo) while keeping the usual
# kmeans components ($cluster, $size, ...) used later in this section.
km3 <- eclust(USarrests, "kmeans", k = 3, nstart = 25, graph = FALSE)

# Silhouette information: per-point widths, per-cluster averages, overall mean
silinfo <- km3$silinfo
names(silinfo)

# Silhouette widths of each observation
head(silinfo$widths[, 1:3], 10)

# Average silhouette width of each cluster
silinfo$clus.avg.widths

# The total average (mean of all individual silhouette widths)
silinfo$avg.width

# The size of each clusters
km3$size

# Silhouette width of observation
# (typo "Silhouette With" fixed in the message below)
cat("\n\nShow Silhouette Width per Data Point. The higher the better:\n")
sil <- km3$silinfo$widths[, 1:3]
sil

# Objects with negative silhouette — points that sit closer to a neighbouring
# cluster than to their own and are likely misassigned.
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
# Statistics for k-means clustering
# fpc::cluster.stats() computes a battery of internal validity indices from
# the distance matrix and the integer cluster assignment.
km_stats <- cluster.stats(dist(USarrests), km3$cluster)

# Show Cluster Index
# The cat() calls below explain the most useful components of km_stats
# before the full object is printed.
cat("\n\nShow Overview of Cluster Characteristics:\n")
## 
## 
## Show Overview of Cluster Characteristics:
cat("- Cluster Sizes ($cluster.size): The number of data points in each cluster.\n")
## - Cluster Sizes ($cluster.size): The number of data points in each cluster.
cat("- Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.\n")
## - Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.
cat("- Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.\n")
## - Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.
cat("- Average Distances Between Clusters ($average.between): The average distance between points in different clusters.\n")
## - Average Distances Between Clusters ($average.between): The average distance between points in different clusters.
cat("- Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.\n")
## - Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.
cat("- Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.\n")
## - Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.
cat("- Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.\n")
## - Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.
cat("- Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.\n")
## - Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.
cat("- Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.\n\n")
## - Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
# Print the full statistics object ($corrected.rand and $vi are NULL because
# no alternative clustering was passed to cluster.stats()).
km_stats
## $n
## [1] 50
## 
## $cluster.number
## [1] 3
## 
## $cluster.size
## [1] 17 20 13
## 
## $min.cluster.size
## [1] 13
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 3.088 4.420 2.448
## 
## $average.distance
## [1] 1.470 2.067 1.314
## 
## $median.distance
## [1] 1.441 1.937 1.269
## 
## $separation
## [1] 0.5279 0.9787 0.5279
## 
## $average.toother
## [1] 2.599 3.322 3.142
## 
## $separation.matrix
##        [,1]   [,2]   [,3]
## [1,] 0.0000 0.9787 0.5279
## [2,] 0.9787 0.0000 1.7490
## [3,] 0.5279 1.7490 0.0000
## 
## $ave.between.matrix
##       [,1]  [,2]  [,3]
## [1,] 0.000 2.853 2.207
## [2,] 2.853 0.000 3.936
## [3,] 2.207 3.936 0.000
## 
## $average.between
## [1] 3.022
## 
## $average.within
## [1] 1.668
## 
## $n.between
## [1] 821
## 
## $n.within
## [1] 404
## 
## $max.diameter
## [1] 4.42
## 
## $min.separation
## [1] 0.5279
## 
## $within.cluster.ss
## [1] 78.32
## 
## $clus.avg.silwidths
##      1      2      3 
## 0.3179 0.2606 0.3735 
## 
## $avg.silwidth
## [1] 0.3094
## 
## $g2
## NULL
## 
## $g3
## NULL
## 
## $pearsongamma
## [1] 0.5416
## 
## $dunn
## [1] 0.1194
## 
## $dunn2
## [1] 1.068
## 
## $entropy
## [1] 1.084
## 
## $wb.ratio
## [1] 0.5521
## 
## $ch
## [1] 35.31
## 
## $cwidegap
## [1] 1.1803 2.0581 0.9825
## 
## $widestgap
## [1] 2.058
## 
## $sindex
## [1] 0.7068
## 
## $corrected.rand
## NULL
## 
## $vi
## NULL

5 K-Means Clustering for Uniform Data

5.1 Make 100 Uniform Data

# Fix the RNG seed so the simulated data are reproducible
set.seed(5566)

# Draw 100 observations from U(0, 1) and store them in a one-column data
# frame whose column is named "Uniform" (same result as creating the frame
# and renaming the column in separate steps).
UniformData <- data.frame(Uniform = runif(100, min = 0, max = 1))

# Persist the simulated data in the working directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
write.csv(UniformData, file = "Uniform.csv")

5.2 Explore Data for Uniform Data

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Show Characteristics of Data Frame
# Audit of the simulated uniform data: names, structure, preview, statistics.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(UniformData)
## [1] "Uniform"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(UniformData)
## 'data.frame':    100 obs. of  1 variable:
##  $ Uniform: num  0.417 0.724 0.968 0.725 0.729 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(UniformData, 5)
##   Uniform
## 1  0.4167
## 2  0.7241
## 3  0.9680
## 4  0.7250
## 5  0.7294
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# vtable::st() summary table (same layout as for the other data sets).
st(UniformData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##   Variable   N  Mean    SD      Min    P25    P50    P75    Max NA Mode
## 1  Uniform 100 0.512 0.284 0.004347 0.2916 0.5031 0.7347 0.9827  0

5.3 Estimating the Optimal Number of Clusters

# Plot Graph of Cluster Number vs SSq (elbow method) for the uniform data —
# uniform data has no real cluster structure, so no sharp elbow is expected.
# FIX: the original theme() call ended with a trailing comma (an empty
# argument, which R rejects) and used the invalid theme element `plot.x`;
# the commented geom_vline() also used the invalid argument `linecolor`.
fviz_nbclust(UniformData, kmeans, method = "wss", linecolor = "red") +
#  geom_vline(xintercept = 4, size = 1, linetype = 2, color = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Number of Clusters and Respective Sum of Squares",
       subtitle = "Based on 100 Uniform Data",
       caption = "Data source: Generated",
       x = "Number of Clusters", 
       y = "Total Within Sum of Squares")

5.4 Compute Cluster Details for 2 Clusters

# Calculate Cluster Data 
# k-means with 2 centres on the 1-D uniform data; nstart = 25 restarts keep
# the best of 25 random initialisations.
km2 <- kmeans(UniformData, centers = 2, nstart = 25)
cat("\nK-means Clustering Overview for 2 Clusters:\n\n")
## 
## K-means Clustering Overview for 2 Clusters:
print(km2)
## K-means clustering with 2 clusters of sizes 53, 47
## 
## Cluster means:
##   Uniform
## 1  0.2878
## 2  0.7648
## 
## Clustering vector:
##   [1] 1 2 2 2 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 1 1 2 1
##  [38] 2 2 1 2 1 2 1 1 1 1 2 2 2 1 1 1 1 1 2 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1 1
##  [75] 2 2 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 1.3774 0.9396
##  (between_SS / total_SS =  71.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show members of Clusters
# Create a data frame with the original data and the cluster assignments
clustered_data <- data.frame(UniformData, Cluster = km2$cluster)

# Print members of each cluster
# (loop over cluster ids 1..max and list the rows assigned to each)
for (i in 1:max(clustered_data$Cluster)) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
## 
## Cluster 1 :
##     Uniform Cluster
## 1  0.416749       1
## 6  0.176541       1
## 7  0.233097       1
## 8  0.466803       1
## 9  0.387857       1
## 12 0.506416       1
## 13 0.510548       1
## 14 0.137254       1
## 15 0.024193       1
## 16 0.004347       1
## 18 0.173520       1
## 20 0.489927       1
## 21 0.146855       1
## 23 0.234625       1
## 25 0.286042       1
## 27 0.414640       1
## 29 0.499882       1
## 34 0.218180       1
## 35 0.325591       1
## 37 0.046964       1
## 40 0.004625       1
## 42 0.119776       1
## 44 0.497205       1
## 45 0.170997       1
## 46 0.492589       1
## 47 0.092851       1
## 51 0.301067       1
## 52 0.308250       1
## 53 0.310212       1
## 54 0.420015       1
## 55 0.011431       1
## 60 0.029422       1
## 62 0.293446       1
## 69 0.416553       1
## 73 0.389575       1
## 74 0.051923       1
## 78 0.200345       1
## 79 0.463818       1
## 80 0.334846       1
## 81 0.043274       1
## 82 0.230071       1
## 83 0.487678       1
## 84 0.480661       1
## 85 0.469906       1
## 87 0.241851       1
## 90 0.148578       1
## 92 0.521560       1
## 93 0.280545       1
## 94 0.459477       1
## 95 0.299635       1
## 96 0.419346       1
## 98 0.236637       1
## 99 0.323830       1
## 
## 
## Cluster 2 :
##     Uniform Cluster
## 2    0.7241       2
## 3    0.9680       2
## 4    0.7250       2
## 5    0.7294       2
## 10   0.8927       2
## 11   0.7409       2
## 17   0.9827       2
## 19   0.6087       2
## 22   0.9818       2
## 24   0.9125       2
## 26   0.6264       2
## 28   0.7862       2
## 30   0.6711       2
## 31   0.9483       2
## 32   0.8962       2
## 33   0.8614       2
## 36   0.5518       2
## 38   0.5409       2
## 39   0.8117       2
## 41   0.7574       2
## 43   0.7701       2
## 48   0.8144       2
## 49   0.9365       2
## 50   0.5604       2
## 56   0.9754       2
## 57   0.7284       2
## 58   0.6278       2
## 59   0.6480       2
## 61   0.9814       2
## 63   0.7037       2
## 64   0.5481       2
## 65   0.6221       2
## 66   0.7032       2
## 67   0.9134       2
## 68   0.5635       2
## 70   0.7499       2
## 71   0.5850       2
## 72   0.6176       2
## 75   0.8789       2
## 76   0.7953       2
## 77   0.5288       2
## 86   0.8551       2
## 88   0.9509       2
## 89   0.9003       2
## 91   0.6523       2
## 97   0.7326       2
## 100  0.8864       2
# List Means with Original Data
cat("\n\nAverages per Cluster:\n")
## 
## 
## Averages per Cluster:
# Per-cluster means of the uniform variable (the two cluster centroids).
aggregate(UniformData, by = list(cluster = km2$cluster), mean)
##   cluster Uniform
## 1       1  0.2878
## 2       2  0.7648

6 K-Means Clustering for Mall Customers

6.1 Get Data

# Download Training Data from URL

6.2 Explore Data for Mall Customers

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Show Characteristics of Data Frame
# Audit of the mall-customer data: names, structure, preview, statistics.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(MallCust)
## [1] "Gender"        "Age"           "AnnualIncome"  "SpendingScore"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(MallCust)
## 'data.frame':    200 obs. of  4 variables:
##  $ Gender       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ Age          : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ AnnualIncome : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ SpendingScore: int  39 81 6 77 40 76 6 94 3 72 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(MallCust, 5)
##   Gender Age AnnualIncome SpendingScore
## 1   Male  19           15            39
## 2   Male  21           15            81
## 3 Female  20           16             6
## 4 Female  23           16            77
## 5 Female  31           17            40
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# vtable::st() summary; Gender is a factor, so the second summ vector
# produces the count/percent rows seen in the output below.
st(MallCust, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##        Variable   N  Mean    SD Min   P25  P50 P75 Max NA Mode
## 1        Gender 200                                           
## 2    ... Female 112   56%                                     
## 3      ... Male  88   44%                                     
## 4           Age 200 38.85 13.97  18 28.75   36  49  70  0     
## 5  AnnualIncome 200 60.56 26.26  15  41.5 61.5  78 137  0     
## 6 SpendingScore 200  50.2 25.82   1 34.75   50  73  99  0
# Plot Income vs Spending Score by Gender
# FIX: the original theme() call ended with a trailing comma (an empty
# argument, which R rejects at evaluation time) and used `plot.x`, which is
# not a valid ggplot2 theme element.
ggplot(MallCust, aes(x = AnnualIncome, y = SpendingScore, color = Gender)) +
    geom_point() +
    labs(title = "Income vs Spending Score by Gender", 
         subtitle = "Based on Public Domain Data ",
         caption = "Data source: Public Domain") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 22),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 12),
    legend.title = element_text(color = "black", size = 14), 
    legend.text = element_text(color = "black", size = 12),
    axis.text.x = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.text.y = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.title = element_text(size = 14, colour = "black")
) 

6.3 Clean and Scale Data

# Omitting any NA values
MallCust <- na.omit(MallCust)

# Remove rows with NA, NaN, or Inf
# NOTE(review): complete.cases() only screens NA/NaN — rows containing Inf
# are NOT removed despite the comment above; it is also redundant after
# na.omit() for plain NA rows.
MallCust <- MallCust[complete.cases(MallCust), ]

# Convert Gender column to numeric: Male -> 1, Female -> 0
MallCust$Gender <- ifelse(MallCust$Gender == "Male", 1, 0)

# Scale Data
# Standardise all numeric columns (including the 0/1 Gender dummy) so the
# k-means distance treats every variable on a comparable scale.
numeric_cols <- sapply(MallCust, is.numeric)
MallCust[, numeric_cols] <- scale(MallCust[, numeric_cols])

# Write Data to WD
write.csv(MallCust, file = "MallCustScaled.csv")

6.4 Estimating the Optimal Number of Clusters

# Plot Graph of Cluster Number vs SSq with corrected theme and linecolor
# (this chunk already uses the valid element name axis.title.x and the valid
# geom_vline argument `color`, unlike the earlier elbow-plot chunks)
fviz_nbclust(MallCust, kmeans, method = "wss", linecolor = "red") +
  geom_vline(xintercept = 5, size = 1, linetype = 2, color = "darkblue") +
  theme(
    panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title.x = element_text(size = 12)  # Corrected 'plot.x' to 'axis.title.x'
  ) +
  labs(
    title = "Number of Clusters and Respective Sum of Squares",
    subtitle = "Based on Mall Customer Data",
    caption = "Data source: Public Domain",
    x = "Number of Clusters", 
    y = "Total Within Sum of Squares"
  )

6.5 Compute Cluster Indicators

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Calculate Correlations 
# Pearson-correlation-based distance matrix between customers (factoextra).
dist.cor <- get_dist(MallCust, method = "pearson")

# Show a Subset
round(as.matrix(dist.cor)[1:8, 1:8], 1)
##     1   2   3   4   5   6   7   8
## 1 0.0 0.1 0.2 0.7 0.7 0.7 0.9 0.7
## 2 0.1 0.0 0.6 0.2 0.4 0.2 1.2 0.3
## 3 0.2 0.6 0.0 1.3 0.9 1.3 0.4 1.3
## 4 0.7 0.2 1.3 0.0 0.3 0.0 1.4 0.0
## 5 0.7 0.4 0.9 0.3 0.0 0.3 0.7 0.3
## 6 0.7 0.2 1.3 0.0 0.3 0.0 1.4 0.0
## 7 0.9 1.2 0.4 1.4 0.7 1.4 0.0 1.5
## 8 0.7 0.3 1.3 0.0 0.3 0.0 1.5 0.0
# Display Graph
fviz_dist(dist.cor)

6.6 Compute Cluster Details for 4 Clusters

# Calculate Cluster Data 
# k-means with 4 centres on the scaled customer data; 25 random restarts.
km4 <- kmeans(MallCust, centers = 4, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:
print(km4)
## K-means clustering with 4 clusters of sizes 57, 55, 48, 40
## 
## Cluster means:
##    Gender     Age AnnualIncome SpendingScore
## 1 -0.8842 -0.7453     -0.03401        0.6771
## 2 -0.8842  0.6628     -0.06632       -0.5971
## 3  1.1253  0.7579      0.07069       -0.8129
## 4  1.1253 -0.7588      0.05483        0.8316
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   4   4   2   1   1   1   2   1   3   1   3   1   2   1   3   4   2   4   3   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   4   2   4   2   4   2   4   2   1   3   1   3   4   2   1   2   1   2   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   4   3   1   2   1   2   1   1   1   2   4   1   3   2   3   2   3   1   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   4   2   2   3   4   2   2   4   1   3   2   2   2   3   4   2   3   1   2 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   3   4   3   2   1   3   2   1   1   2   2   4   3   2   1   4   2   1   3   4 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   2   3   4   3   1   2   3   3   3   3   1   2   4   1   1   2   2   2   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   4   2   1   4   1   1   3   4   3   4   3   4   1   1   3   1   2   4   3   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   2   4   1   1   3   4   3   1   2   4   3   4   2   1   2   1   3   1   3   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   2   1   3   1   3   1   3   1   2   4   3   4   3   4   2   1   3   4   3   4 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   1   3   1   2   4   2   4   2   1   2   1   3   1   2   1   2   4   3   4 
## 
## Within cluster sum of squares by cluster:
## [1]  94.92 100.72 115.12  74.02
##  (between_SS / total_SS =  51.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show members of Clusters
# Create a data frame with the original (unscaled) data and the k = 4
# cluster assignments so members print in interpretable units.
clustered_data <- data.frame(MallCustRaw, Cluster = km4$cluster)

# Print members of each cluster.
# seq_len() replaces 1:max(...) — behaviour is identical here (clusters are
# numbered from 1) but seq_len() cannot loop backwards on an empty result.
for (i in seq_len(max(clustered_data$Cluster))) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
## 
## Cluster 1 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 4   Female  23           16            77       1
## 5   Female  31           17            40       1
## 6   Female  22           17            76       1
## 8   Female  23           18            94       1
## 10  Female  30           19            72       1
## 12  Female  35           19            99       1
## 14  Female  24           20            77       1
## 20  Female  35           23            98       1
## 30  Female  23           29            87       1
## 32  Female  21           30            73       1
## 36  Female  21           33            81       1
## 38  Female  30           34            73       1
## 40  Female  20           37            75       1
## 44  Female  31           39            61       1
## 46  Female  24           39            65       1
## 48  Female  27           40            47       1
## 49  Female  29           40            42       1
## 50  Female  31           40            42       1
## 53  Female  31           43            54       1
## 59  Female  27           46            51       1
## 70  Female  32           48            47       1
## 79  Female  23           54            52       1
## 85  Female  21           54            57       1
## 88  Female  22           57            55       1
## 89  Female  34           58            60       1
## 95  Female  32           60            42       1
## 98  Female  27           60            50       1
## 101 Female  23           62            41       1
## 106 Female  21           62            42       1
## 112 Female  19           63            54       1
## 115 Female  18           65            48       1
## 116 Female  19           65            50       1
## 123 Female  40           69            58       1
## 125 Female  23           70            29       1
## 126 Female  31           70            77       1
## 133 Female  25           72            34       1
## 134 Female  31           72            71       1
## 136 Female  29           73            88       1
## 140 Female  35           74            72       1
## 143 Female  28           76            40       1
## 144 Female  32           76            87       1
## 148 Female  32           77            74       1
## 154 Female  38           78            76       1
## 156 Female  27           78            89       1
## 158 Female  30           78            78       1
## 160 Female  30           78            73       1
## 162 Female  29           79            83       1
## 164 Female  31           81            93       1
## 166 Female  36           85            75       1
## 168 Female  33           86            95       1
## 176 Female  30           88            86       1
## 182 Female  32           97            86       1
## 184 Female  29           98            88       1
## 190 Female  36          103            85       1
## 192 Female  32          103            69       1
## 194 Female  38          113            91       1
## 196 Female  35          120            79       1
## 
## 
## Cluster 2 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 3   Female  20           16             6       2
## 7   Female  35           18             6       2
## 13  Female  58           20            15       2
## 17  Female  35           21            35       2
## 23  Female  46           25             5       2
## 25  Female  54           28            14       2
## 27  Female  45           28            32       2
## 29  Female  40           29            31       2
## 35  Female  49           33            14       2
## 37  Female  42           34            17       2
## 39  Female  36           37            26       2
## 41  Female  65           38            35       2
## 45  Female  49           39            28       2
## 47  Female  50           40            55       2
## 51  Female  49           42            52       2
## 55  Female  50           43            45       2
## 57  Female  51           44            50       2
## 63  Female  67           47            52       2
## 64  Female  54           47            59       2
## 67  Female  43           48            50       2
## 68  Female  68           48            48       2
## 72  Female  47           49            42       2
## 73  Female  60           50            49       2
## 74  Female  60           50            56       2
## 77  Female  45           54            53       2
## 80  Female  49           54            42       2
## 84  Female  46           54            44       2
## 87  Female  55           57            58       2
## 90  Female  50           58            46       2
## 91  Female  68           59            55       2
## 94  Female  40           60            40       2
## 97  Female  47           60            47       2
## 102 Female  49           62            48       2
## 107 Female  66           63            50       2
## 113 Female  38           64            42       2
## 117 Female  63           65            43       2
## 118 Female  49           65            59       2
## 119 Female  51           67            43       2
## 120 Female  50           67            57       2
## 122 Female  38           67            40       2
## 137 Female  44           73             7       2
## 141 Female  57           75             5       2
## 149 Female  34           78            22       2
## 153 Female  44           78            20       2
## 155 Female  47           78            16       2
## 161 Female  56           79            35       2
## 169 Female  36           87            27       2
## 175 Female  52           88            13       2
## 181 Female  37           97            32       2
## 185 Female  41           99            39       2
## 187 Female  54          101            24       2
## 189 Female  41          103            17       2
## 191 Female  34          103            23       2
## 195 Female  47          120            16       2
## 197 Female  45          126            28       2
## 
## 
## Cluster 3 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 9     Male  64           19             3       3
## 11    Male  67           19            14       3
## 15    Male  37           20            13       3
## 19    Male  52           23            29       3
## 21    Male  35           24            35       3
## 31    Male  60           30             4       3
## 33    Male  53           33             4       3
## 43    Male  48           39            36       3
## 54    Male  59           43            60       3
## 56    Male  47           43            41       3
## 58    Male  69           44            46       3
## 60    Male  53           46            46       3
## 61    Male  70           46            56       3
## 65    Male  63           48            51       3
## 71    Male  70           49            55       3
## 75    Male  59           54            47       3
## 78    Male  40           54            48       3
## 81    Male  57           54            51       3
## 83    Male  67           54            41       3
## 86    Male  48           54            46       3
## 93    Male  48           60            49       3
## 99    Male  48           61            42       3
## 103   Male  67           62            59       3
## 105   Male  49           62            56       3
## 108   Male  54           63            46       3
## 109   Male  68           63            43       3
## 110   Male  66           63            48       3
## 111   Male  65           63            52       3
## 127   Male  43           71            35       3
## 129   Male  59           71            11       3
## 131   Male  47           71             9       3
## 135   Male  20           73             5       3
## 139   Male  19           74            10       3
## 145   Male  25           77            12       3
## 147   Male  48           77            36       3
## 151   Male  43           78            17       3
## 157   Male  37           78             1       3
## 159   Male  34           78             1       3
## 163   Male  19           81             5       3
## 165   Male  50           85            26       3
## 167   Male  42           86            20       3
## 171   Male  40           87            13       3
## 173   Male  36           87            10       3
## 177   Male  58           88            15       3
## 179   Male  59           93            14       3
## 183   Male  46           98            15       3
## 193   Male  33          113             8       3
## 199   Male  32          137            18       3
## 
## 
## Cluster 4 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 1     Male  19           15            39       4
## 2     Male  21           15            81       4
## 16    Male  22           20            79       4
## 18    Male  20           21            66       4
## 22    Male  25           24            73       4
## 24    Male  31           25            73       4
## 26    Male  29           28            82       4
## 28    Male  35           28            61       4
## 34    Male  18           33            92       4
## 42    Male  24           38            92       4
## 52    Male  33           42            60       4
## 62    Male  19           46            55       4
## 66    Male  18           48            59       4
## 69    Male  19           48            59       4
## 76    Male  26           54            54       4
## 82    Male  38           54            55       4
## 92    Male  18           59            41       4
## 96    Male  24           60            52       4
## 100   Male  20           61            49       4
## 104   Male  26           62            55       4
## 114   Male  19           64            46       4
## 121   Male  27           67            56       4
## 124   Male  39           69            91       4
## 128   Male  40           71            95       4
## 130   Male  38           71            75       4
## 132   Male  39           71            75       4
## 138   Male  32           73            73       4
## 142   Male  32           75            93       4
## 146   Male  28           77            97       4
## 150   Male  34           78            90       4
## 152   Male  39           78            88       4
## 170   Male  32           87            63       4
## 172   Male  28           87            75       4
## 174   Male  36           87            92       4
## 178   Male  27           88            69       4
## 180   Male  35           93            90       4
## 186   Male  30           99            97       4
## 188   Male  28          101            68       4
## 198   Male  32          126            74       4
## 200   Male  30          137            83       4
# List Means with Original Data (per-cluster averages in original units)
# NOTE(review): Gender is non-numeric here, so mean() returns NA for it —
# visible as the NA column in the output below. Harmless but noisy.
aggregate(MallCustRaw, by = list(cluster = km4$cluster), mean)
##   cluster Gender   Age AnnualIncome SpendingScore
## 1       1     NA 28.44        59.67         67.68
## 2       2     NA 48.11        58.82         34.78
## 3       3     NA 49.44        62.42         29.21
## 4       4     NA 28.25        62.00         71.67

6.7 Compute Cluster Details for 5 Clusters

# Calculate Cluster Data 
# k-means with k = 5; nstart = 25 random restarts, best solution kept.
km5 <- kmeans(MallCust, centers = 5, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
## 
## K-means Clustering Overview:
print(km5)
## K-means clustering with 5 clusters of sizes 40, 56, 42, 31, 31
## 
## Cluster means:
##    Gender     Age AnnualIncome SpendingScore
## 1  1.1253 -0.7588     0.054826        0.8316
## 2 -0.8842 -0.7486    -0.005004        0.6962
## 3 -0.8842  0.7368    -0.541664       -0.4097
## 4  1.1253  1.2208    -0.448731       -0.4412
## 5  0.2178  0.1123     1.120895       -1.3344
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   3   2   3   2   3   2   4   2   4   2   3   2   4   1   3   1   4   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   4   1   3   1   3   1   3   1   3   2   4   2   4   1   3   2   3   2   3   2 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   1   4   2   3   2   3   2   2   2   3   1   2   4   3   4   3   4   2   4 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   4   1   3   3   4   1   3   3   1   2   4   3   3   3   4   1   3   4   2   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   4   1   4   3   2   4   3   2   2   3   3   1   4   3   2   1   3   2   4   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   3   4   1   4   2   3   4   4   4   4   2   3   1   2   2   3   3   3   3 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   3   2   1   2   2   4   1   4   1   5   1   2   2   5   2   5   1   5   2 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   5   1   2   2   5   1   4   2   5   1   5   1   5   2   5   2   5   2   5   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   3   2   5   2   5   2   5   2   5   1   5   1   5   1   5   2   5   1   5   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   5   2   5   2   5   1   5   1   5   2   5   2   5   2   5   2   5   1   5   1 
## 
## Within cluster sum of squares by cluster:
## [1] 74.02 91.03 53.56 40.64 64.21
##  (between_SS / total_SS =  59.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Show members of Clusters
# Create a data frame with the original (unscaled) data and the k = 5
# cluster assignments so members print in interpretable units.
clustered_data <- data.frame(MallCustRaw, Cluster = km5$cluster)

# Print members of each cluster.
# seq_len() replaces 1:max(...) — behaviour is identical here (clusters are
# numbered from 1) but seq_len() cannot loop backwards on an empty result.
for (i in seq_len(max(clustered_data$Cluster))) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
## 
## Cluster 1 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 1     Male  19           15            39       1
## 2     Male  21           15            81       1
## 16    Male  22           20            79       1
## 18    Male  20           21            66       1
## 22    Male  25           24            73       1
## 24    Male  31           25            73       1
## 26    Male  29           28            82       1
## 28    Male  35           28            61       1
## 34    Male  18           33            92       1
## 42    Male  24           38            92       1
## 52    Male  33           42            60       1
## 62    Male  19           46            55       1
## 66    Male  18           48            59       1
## 69    Male  19           48            59       1
## 76    Male  26           54            54       1
## 82    Male  38           54            55       1
## 92    Male  18           59            41       1
## 96    Male  24           60            52       1
## 100   Male  20           61            49       1
## 104   Male  26           62            55       1
## 114   Male  19           64            46       1
## 121   Male  27           67            56       1
## 124   Male  39           69            91       1
## 128   Male  40           71            95       1
## 130   Male  38           71            75       1
## 132   Male  39           71            75       1
## 138   Male  32           73            73       1
## 142   Male  32           75            93       1
## 146   Male  28           77            97       1
## 150   Male  34           78            90       1
## 152   Male  39           78            88       1
## 170   Male  32           87            63       1
## 172   Male  28           87            75       1
## 174   Male  36           87            92       1
## 178   Male  27           88            69       1
## 180   Male  35           93            90       1
## 186   Male  30           99            97       1
## 188   Male  28          101            68       1
## 198   Male  32          126            74       1
## 200   Male  30          137            83       1
## 
## 
## Cluster 2 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 4   Female  23           16            77       2
## 6   Female  22           17            76       2
## 8   Female  23           18            94       2
## 10  Female  30           19            72       2
## 12  Female  35           19            99       2
## 14  Female  24           20            77       2
## 20  Female  35           23            98       2
## 30  Female  23           29            87       2
## 32  Female  21           30            73       2
## 36  Female  21           33            81       2
## 38  Female  30           34            73       2
## 40  Female  20           37            75       2
## 44  Female  31           39            61       2
## 46  Female  24           39            65       2
## 48  Female  27           40            47       2
## 49  Female  29           40            42       2
## 50  Female  31           40            42       2
## 53  Female  31           43            54       2
## 59  Female  27           46            51       2
## 70  Female  32           48            47       2
## 79  Female  23           54            52       2
## 85  Female  21           54            57       2
## 88  Female  22           57            55       2
## 89  Female  34           58            60       2
## 95  Female  32           60            42       2
## 98  Female  27           60            50       2
## 101 Female  23           62            41       2
## 106 Female  21           62            42       2
## 112 Female  19           63            54       2
## 115 Female  18           65            48       2
## 116 Female  19           65            50       2
## 123 Female  40           69            58       2
## 125 Female  23           70            29       2
## 126 Female  31           70            77       2
## 133 Female  25           72            34       2
## 134 Female  31           72            71       2
## 136 Female  29           73            88       2
## 140 Female  35           74            72       2
## 143 Female  28           76            40       2
## 144 Female  32           76            87       2
## 148 Female  32           77            74       2
## 154 Female  38           78            76       2
## 156 Female  27           78            89       2
## 158 Female  30           78            78       2
## 160 Female  30           78            73       2
## 162 Female  29           79            83       2
## 164 Female  31           81            93       2
## 166 Female  36           85            75       2
## 168 Female  33           86            95       2
## 176 Female  30           88            86       2
## 182 Female  32           97            86       2
## 184 Female  29           98            88       2
## 190 Female  36          103            85       2
## 192 Female  32          103            69       2
## 194 Female  38          113            91       2
## 196 Female  35          120            79       2
## 
## 
## Cluster 3 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 3   Female  20           16             6       3
## 5   Female  31           17            40       3
## 7   Female  35           18             6       3
## 13  Female  58           20            15       3
## 17  Female  35           21            35       3
## 23  Female  46           25             5       3
## 25  Female  54           28            14       3
## 27  Female  45           28            32       3
## 29  Female  40           29            31       3
## 35  Female  49           33            14       3
## 37  Female  42           34            17       3
## 39  Female  36           37            26       3
## 41  Female  65           38            35       3
## 45  Female  49           39            28       3
## 47  Female  50           40            55       3
## 51  Female  49           42            52       3
## 55  Female  50           43            45       3
## 57  Female  51           44            50       3
## 63  Female  67           47            52       3
## 64  Female  54           47            59       3
## 67  Female  43           48            50       3
## 68  Female  68           48            48       3
## 72  Female  47           49            42       3
## 73  Female  60           50            49       3
## 74  Female  60           50            56       3
## 77  Female  45           54            53       3
## 80  Female  49           54            42       3
## 84  Female  46           54            44       3
## 87  Female  55           57            58       3
## 90  Female  50           58            46       3
## 91  Female  68           59            55       3
## 94  Female  40           60            40       3
## 97  Female  47           60            47       3
## 102 Female  49           62            48       3
## 107 Female  66           63            50       3
## 113 Female  38           64            42       3
## 117 Female  63           65            43       3
## 118 Female  49           65            59       3
## 119 Female  51           67            43       3
## 120 Female  50           67            57       3
## 122 Female  38           67            40       3
## 161 Female  56           79            35       3
## 
## 
## Cluster 4 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 9     Male  64           19             3       4
## 11    Male  67           19            14       4
## 15    Male  37           20            13       4
## 19    Male  52           23            29       4
## 21    Male  35           24            35       4
## 31    Male  60           30             4       4
## 33    Male  53           33             4       4
## 43    Male  48           39            36       4
## 54    Male  59           43            60       4
## 56    Male  47           43            41       4
## 58    Male  69           44            46       4
## 60    Male  53           46            46       4
## 61    Male  70           46            56       4
## 65    Male  63           48            51       4
## 71    Male  70           49            55       4
## 75    Male  59           54            47       4
## 78    Male  40           54            48       4
## 81    Male  57           54            51       4
## 83    Male  67           54            41       4
## 86    Male  48           54            46       4
## 93    Male  48           60            49       4
## 99    Male  48           61            42       4
## 103   Male  67           62            59       4
## 105   Male  49           62            56       4
## 108   Male  54           63            46       4
## 109   Male  68           63            43       4
## 110   Male  66           63            48       4
## 111   Male  65           63            52       4
## 127   Male  43           71            35       4
## 129   Male  59           71            11       4
## 147   Male  48           77            36       4
## 
## 
## Cluster 5 :
##     Gender Age AnnualIncome SpendingScore Cluster
## 131   Male  47           71             9       5
## 135   Male  20           73             5       5
## 137 Female  44           73             7       5
## 139   Male  19           74            10       5
## 141 Female  57           75             5       5
## 145   Male  25           77            12       5
## 149 Female  34           78            22       5
## 151   Male  43           78            17       5
## 153 Female  44           78            20       5
## 155 Female  47           78            16       5
## 157   Male  37           78             1       5
## 159   Male  34           78             1       5
## 163   Male  19           81             5       5
## 165   Male  50           85            26       5
## 167   Male  42           86            20       5
## 169 Female  36           87            27       5
## 171   Male  40           87            13       5
## 173   Male  36           87            10       5
## 175 Female  52           88            13       5
## 177   Male  58           88            15       5
## 179   Male  59           93            14       5
## 181 Female  37           97            32       5
## 183   Male  46           98            15       5
## 185 Female  41           99            39       5
## 187 Female  54          101            24       5
## 189 Female  41          103            17       5
## 191 Female  34          103            23       5
## 193   Male  33          113             8       5
## 195 Female  47          120            16       5
## 197 Female  45          126            28       5
## 199   Male  32          137            18       5
# List Means — per-cluster averages of the *standardised* data
# (these match km5$centers; the original-units version follows below).
aggregate(MallCust, by = list(cluster = km5$cluster), mean)
##   cluster  Gender     Age AnnualIncome SpendingScore
## 1       1  1.1253 -0.7588     0.054826        0.8316
## 2       2 -0.8842 -0.7486    -0.005004        0.6962
## 3       3 -0.8842  0.7368    -0.541664       -0.4097
## 4       4  1.1253  1.2208    -0.448731       -0.4412
## 5       5  0.2178  0.1123     1.120895       -1.3344
# Per-cluster means in original units; Gender is non-numeric so mean()
# yields the NA column visible in the output below.
aggregate(MallCustRaw, by = list(cluster = km5$cluster), mean)
##   cluster Gender   Age AnnualIncome SpendingScore
## 1       1     NA 28.25        62.00         71.67
## 2       2     NA 28.39        60.43         68.18
## 3       3     NA 49.14        46.33         39.62
## 4       4     NA 55.90        48.77         38.81
## 5       5     NA 40.42        90.00         15.74

6.8 Perform k-Means Clustering with k = 4

# Load Library if not available
# NOTE(review): 'stats' ships with base R, so this guard never installs anything.
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Generate k-Means Cluster (k = 4, 25 random starts; best fit kept)
# NOTE(review): kmeans() is re-run here without a fixed seed, so cluster
# *labels* may differ from the km4 computed in section 6.6 — confirm if
# label stability matters downstream.
km4 <- kmeans(MallCust, centers = 4, nstart = 25)
 
# Visualize the clusters (quick default view)
fviz_cluster(km4, data = MallCust)

# saving the file 
# dev.off()
 
# output to be present as PNG file 
png(file = "KMeansCluster4.png", width = 1000, height = 1000)
 
# Visualize the clusters (styled view, drawn on the PNG device)
fviz_cluster(km4, data = MallCust, 
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"), 
             ellipse.type = "euclid",  # Concentration ellipse
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) +                                    # (trailing empty argument removed)
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
    # 'plot.x' is not a valid ggplot2 theme element (the file itself corrects
    # this to axis.title.x in section 6.4); removed along with the trailing comma.
) +
labs(title = "Clusters Plot of Mall Customer Data",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain")

# Add Cluster Number to the scaled and the original data frames
MallCust$Cluster4 <- km4$cluster
MallCustRaw$Cluster4 <- km4$cluster

# Close the PNG device (flushes the plot to disk)
dev.off()
## png 
##   2
# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Write Data to WD
# NOTE(review): paste() puts a space between the date and the file name
# (e.g. "250101 MallCustScaledClust4.csv"); kept for backward compatibility.
write.csv(MallCust, file = paste(today, "MallCustScaledClust4.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw4.csv"))

6.9 Perform k-Means Clustering with k = 5

# Load Library if not available
# NOTE(review): 'stats' ships with base R, so this guard never installs anything.
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Visualize the clusters (k = 5 solution computed in section 6.7)
p <- fviz_cluster(km5, data = MallCust, 
             palette = c("#2E9FDF", "#E7B800", "#FC4E07", "green", "brown"), 
             ellipse.type = "euclid",  # Concentration ellipse
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) +                                    # (trailing empty argument removed)
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 22),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 12),
    # 'plot.x' is not a valid ggplot2 theme element and has been removed;
    # axis titles are already styled via axis.title.x/.y below.
    legend.title = element_text(color = "black", size = 14), 
    legend.text = element_text(color = "black", size = 12),
    axis.text.x = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.text.y = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.title.x = element_text(size = 14, colour = "black"),
    axis.title.y = element_text(size = 14, colour = "black"),
    legend.position = "top"
) +
labs(title = "Mall Customer Segments",
     subtitle = "Clustered by Age, Gender, Annual Income and Spending",
     caption = "Data source: Public Domain"
     )

p

# Save the plot as a 12 x 9 inch PNG
ggsave(filename = paste("Mall Customer Segments", ".png", sep = ""), plot = p, width = 12, height = 9)

# Add Cluster Number to the scaled and the original data frames
MallCust$Segment <- km5$cluster
MallCustRaw$Segment <- km5$cluster

# saving the file 
# dev.off()

# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Write Data to WD
# NOTE(review): paste() puts a space between the date and the file name; kept as-is.
write.csv(MallCust, file = paste(today, "MallCustScaledClust5.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw5.csv"))

6.10 Understand Customer Segments

# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)

# Describe customer Segments
# Builds one profile row per segment: mean age, income and spending score,
# plus a list-column holding each segment's Gender frequency table
# (inspect with segment_profiles$Gender_Distribution[[i]]).
# NOTE(review): assumes MallCustRaw already carries the Segment column
# (added after the k = 5 run) and that Age/AnnualIncome/SpendingScore have
# no NAs -- mean() here would propagate them; confirm against the data load.
segment_profiles <- MallCustRaw %>%
    group_by(Segment) %>%
    summarise(
        Avg_Age = mean(Age),
        Avg_Income = mean(AnnualIncome),
        Avg_Spending_Score = mean(SpendingScore),
        Gender_Distribution = list(table(Gender))
    )

# Visualise Segments: customer count per segment, split by gender.
# Fixes vs. original: legend title corrected from "Segment" to "Gender" (the
# fill aesthetic maps Gender, not Segment), removed the invalid `plot.x`
# theme element, and dropped the empty trailing argument inside theme()
# (an empty argument in a call is an error in R).
p <- ggplot(MallCustRaw, aes(x = Segment, fill = Gender)) +
 geom_bar(position = "dodge") +
 theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 22),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 12),
    legend.title = element_text(color = "black", size = 14), 
    legend.text = element_text(color = "black", size = 12),
    axis.text.x = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.text.y = element_text(face="bold", color="#993333", size = 14, angle = 0),
    axis.title.x = element_text(size = 14, colour = "black"),
    axis.title.y = element_text(size = 14, colour = "black")
  ) +
  labs(title = "Customer Segments by Gender",
       subtitle = "Based on Public Domain Data ",
       caption = "Data source: Public Domain",
       x = "Customer Segment",
       y = "Customers",
       fill = "Gender") 
p

# Save the plot (12 x 8 in; PNG format inferred from the file extension)
ggsave(filename = paste0("Customer Segments by Gender", ".png"), plot = p, width = 12, height = 8) 

# Write Data to WD
# write.csv(segment_profiles, file = paste(today, "SegmentProfiles5.csv"))

6.11 Perform k-Means Clustering with k = 6

# Load Library if not available
# (stats is a base package and is always present; guard kept for consistency.)
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# output to be present as PNG file 
# png(file = "KMeansCluster.png")
 
# Generate k-Means Cluster (k = 6; 25 random starts for a stable solution)
km6 <- kmeans(MallCust, centers = 6, nstart = 25)
 
# Quick default visualisation of the clusters
fviz_cluster(km6, data = MallCust)

# Styled visualisation of the clusters.
# Fixes vs. original: removed the invalid `plot.x` theme element (not in
# ggplot2's element hierarchy, so theme() rejects it) and the empty trailing
# arguments in the fviz_cluster() and theme() calls (an empty argument in a
# call is an error in R).
fviz_cluster(km6, data = MallCust, 
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "green", "brown"), 
             ellipse.type = "euclid",  # Concentration ellipse
             star.plot = TRUE,         # Add segments from centroids to items
             repel = TRUE              # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Clusters Plot of Mall Customer Data",
       subtitle = "Based on Public Domain Data ",
       caption = "Data source: Public Domain")

# Add Cluster Number to MallCust
MallCust$Cluster6 <- km6$cluster
MallCustRaw$Cluster6 <- km6$cluster

# saving the file 
# dev.off()

# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Write Data to WD
write.csv(MallCust, file = paste(today, "MallCustScaledClust6.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw6.csv"))

6.12 Check Whether Clusters Are Valid

# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)

# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised. 
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# Re-run k-means through fdm2id::KMEANS with the pseudo-F criterion
# (graph = TRUE also plots the resulting clustering).
# NOTE(review): the result is stored as KM4 but k = 5 is requested -- the
# object name looks like a leftover from a k = 4 run; confirm which k this
# validation is meant to cover before reusing KM4 downstream.
# NOTE(review): presumably `criterion = "pseudo-F"` is used by KMEANS to
# select/report the solution -- verify against the fdm2id documentation.
KM4 <- KMEANS(
  MallCust,
  k = 5,
  criterion = "pseudo-F",
  graph = TRUE,
  nstart = 10
)