k-Means Clustering for
Simple Data
Get Data
# Get Data
# Build a small 8-point, 2-D demo data set; row names a-h label the points in later plots
df <- data.frame(X = c(1, 3, 4, 5, 1, 4, 1, 2),
Y = c(3, 3, 3, 3, 2, 2, 1, 1),
row.names = c("a", "b", "c", "d", "e", "f", "g", "h") )
# Write Data to WD
# NOTE(review): setwd() in a script changes state for the whole session and the
# path is machine-specific; later relative paths may depend on it - confirm before
# replacing with an explicit file path.
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
write.csv(df, file = "Simple.csv")
# Scale Data - No Need Since Data Have Similar Dimension
# (both columns are on comparable small integer scales, so standardisation is skipped)
# numeric_cols <- sapply(Simple, is.numeric)
# Simple[, numeric_cols] <- scale(Simple[, numeric_cols])
Explore Data for
df
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(df)
## [1] "X" "Y"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(df)
## 'data.frame': 8 obs. of 2 variables:
## $ X: num 1 3 4 5 1 4 1 2
## $ Y: num 3 3 3 3 2 2 1 1
cat("\n\nAll Rows of Data Frame:\n")
##
##
## All Rows of Data Frame:
head(df, 8)
## X Y
## a 1 3
## b 3 3
## c 4 3
## d 5 3
## e 1 2
## f 4 2
## g 1 1
## h 2 1
cat("\n\nDescriptive Statistics of Columns in OriginalData Frame:\n")
##
##
## Descriptive Statistics of Columns in OriginalData Frame:
# Summary table via vtable::st(); out = "csv" returns the table as a data frame
# instead of rendering it. summ gives the statistics for numeric (first vector)
# and factor (second vector) columns; summ.names are the matching column headers.
# NOTE(review): 'getmode' is not a base-R or vtable built-in summary function -
# it must be defined elsewhere in the session; verify before running standalone.
st(df, add.median = TRUE, out = "csv", col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 X 8 2.625 1.598 1 1 2.5 4 5 0
## 2 Y 8 2.25 0.8864 1 1.75 2.5 3 3 0
Plot Data
# Load the graphics package, installing it first if absent
if (!("graphics" %in% installed.packages())) {
  install.packages("graphics", dependencies = TRUE)
}
library(graphics)
# Palette used for point/cluster colouring in this document
Colors <- c("orange", "green", "red", "blue", "yellow", "pink", "lightgreen", "lightblue")
# Show Data Frame
cat("\n\nShow df Data:\n")
##
##
## Show df Data:
# Scatter plot of the 8 demo points, one filled dot per observation
plot(df$X, df$Y, type = "p",
     main = "Dot Plot of df Data",
     col = "#7c0c18",
     pch = 19,
     cex = 2.5,
     xlab = "X",
     ylab = "Y",
     xlim = c(0.5, 5.5),
     ylim = c(0.5, 3.5))
# Dashed light-grey background grid at the default tick positions
grid(nx = NULL, ny = NULL, lty = 2, col = "lightgrey", lwd = 1)
# legend("topleft", fill = Colors, legend = c("a", "b", "c", "d", "e", "f", "g", "h"), horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")
# Label each point with its row name just below the symbol (pos = 1)
text(df$X, df$Y, row.names(df), cex = 1.6, pos = 1, col = "#7c0c18")

Estimating the
Optimal Number of Clusters
# Plot Graph of Cluster Number vs SSq
# FIX: fviz_nbclust() comes from factoextra, which was never loaded in this file.
if(! "factoextra" %in% installed.packages()) { install.packages("factoextra", dependencies = TRUE) }
library(factoextra)
fviz_nbclust(df, kmeans, method = "wss", linecolor = "red", k.max = 6
) +
# geom_vline(xintercept = 4, size = 1, linetype = 2, linecolor = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
# FIX: removed `plot.x = element_text(size = 0)` - `plot.x` is not a valid
# ggplot2 theme element and theme() errors on unknown elements.
) +
labs(title = "Number of Clusters and Respective Sum of Squares",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain",
col = Colors, pch = 19,
x = "Number of Clusters",
y = "Total Within Sum of Squares"
)

Build Clusters
# Calculate Cluster Data
km2 <- kmeans(df, centers = 2, nstart = 25)
km3 <- kmeans(df, centers = 3, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
print(km2)
## K-means clustering with 2 clusters of sizes 4, 4
##
## Cluster means:
## X Y
## 1 4.00 2.75
## 2 1.25 1.75
##
## Clustering vector:
## a b c d e f g h
## 2 1 1 1 2 1 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2.75 3.50
## (between_SS / total_SS = 73.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Show members of Clusters
# Create a data frame with the original data and the cluster assignments
clustered_data <- data.frame(df, Cluster = km2$cluster)
# Print members of each cluster
# FIX: seq_len() instead of 1:max(...) - `1:n` counts down (c(1, 0)) when n is 0
for (i in seq_len(max(clustered_data$Cluster))) {
cat("\nCluster", i, ":\n")
print(clustered_data[clustered_data$Cluster == i, ])
cat("\n")
}
##
## Cluster 1 :
## X Y Cluster
## b 3 3 1
## c 4 3 1
## d 5 3 1
## f 4 2 1
##
##
## Cluster 2 :
## X Y Cluster
## a 1 3 2
## e 1 2 2
## g 1 1 2
## h 2 1 2
# List Means with Original Data
aggregate(df, by = list(cluster = km2$cluster), mean)
## cluster X Y
## 1 1 4.00 2.75
## 2 2 1.25 1.75
Visualise Clusters
with 2 Centres
# Visualize k-means clusters (fviz_cluster is from factoextra)
fviz_cluster(km2, data = df, geom = "point",
palette = c("#2E9FDF", "#FC4E07", "green"),
ellipse.type = "euclid", # Concentration ellipse (can be euclid, norm)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
legend.title = "Clusters",
stand = FALSE
) +
theme(
panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
# FIX: `plot.x`/`plot.y` are not valid ggplot2 theme elements (theme() errors);
# axis titles are styled via axis.title.x / axis.title.y. The trailing comma
# after the last argument was also removed (an empty argument is an error).
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain"
)

Visualise Clusters
with 3 Centres
# Visualize k-means clusters with 3 centres (fviz_cluster is from factoextra)
fviz_cluster(km3, data = df, geom = "point",
palette = c("#2E9FDF", "#FC4E07", "green"),
ellipse.type = "euclid", # Concentration ellipse (can be euclid, norm)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
legend.title = "Clusters",
stand = FALSE
) +
theme(
panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
# FIX: replaced invalid `plot.x`/`plot.y` theme elements with axis.title.*
# and dropped the trailing comma (both make theme() error).
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain"
)

Check Whether
Clusters Are Valid
# Load Library if not available
# FIX: eclust() below comes from factoextra, not fdm2id - load the right package.
if(! "factoextra" %in% installed.packages()) { install.packages("factoextra", dependencies = TRUE) }
library(factoextra)
# Standardize
# NOTE(review): this overwrites `df` with a scaled *matrix*; every later chunk
# that reuses `df` therefore operates on the standardized data.
df <- scale(df)
# K-means clustering (eclust wraps kmeans and attaches silhouette information)
km.res <- eclust(df, "kmeans", k = 2, nstart = 25, graph = FALSE)
# Hierarchical clustering
hc.res <- eclust(df, "hclust", k = 2, hc_metric = "euclidean",
hc_method = "ward.D2", graph = FALSE)
# Silhouette information
silinfo <- km.res$silinfo
names(silinfo)
## [1] "widths" "clus.avg.widths" "avg.width"
# Silhouette widths of each observation (cluster, nearest neighbor, width)
head(silinfo$widths[, 1:3], 10)
## cluster neighbor sil_width
## g 1 2 0.5172
## e 1 2 0.4436
## h 1 2 0.4051
## a 1 2 0.0240
## c 2 1 0.6689
## d 2 1 0.6345
## b 2 1 0.4626
## f 2 1 0.3775
Plot Cluster
Silhouette Width
# Visualise Silhouettes (fviz_silhouette is from factoextra)
fviz_silhouette(km.res, data = df, geom = "point",
palette = c("#2E9FDF", "#FC4E07"),
ellipse.type = "t", # Concentration ellipse (can be euclid)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
legend.title = "Clusters",
stand = FALSE
) +
theme(
panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
# FIX: replaced invalid `plot.x`/`plot.y` theme elements with axis.title.*
# and dropped the trailing comma (both make theme() error).
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15)
) +
labs(title = "Clusters Plot of Simple Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain"
)
## cluster size ave.sil.width
## 1 1 4 0.35
## 2 2 4 0.54

Check Whether
Clusters Are Valid with R-square and Pseudo-F
# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)
# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised.
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# Run K-means for Clustering
# fdm2id::KMEANS with criterion = "pseudo-F" evaluates candidate cluster counts
# and plots pseudo-F and R-square per k.
# NOTE(review): assumes `k = 4` is the upper bound of the search in fdm2id's
# interface - verify against the package documentation before relying on it.
KM2 <- KMEANS(
df,
k = 4,
criterion = "pseudo-F",
graph = TRUE,
main = "Number of Clusters vs pseudo-F and R-Square"
)

Check Whether
Clusters Are Valid with R-square and Pseudo-F
# Load Library if not available
if(! "clusterSim" %in% installed.packages()) { install.packages("clusterSim", dependencies = TRUE) }
library(clusterSim)
# Perform Clustering
km2 <- kmeans(df, centers = 2, nstart = 25)
km3 <- kmeans(df, centers = 3, nstart = 25)
# Calculate Pseudo-F of 2 Clusters
NumClus <- 2
NumObs <- nrow(df)
# NOTE(review): pseudoF() is applied straight to the kmeans result; it is not an
# obvious export of the clusterSim package loaded above (clusterSim provides
# index.G1 for the Calinski-Harabasz pseudo-F) - verify its origin in this session.
pseudo_F <- pseudoF(km2)
# Upper-tail F test for the pseudo-F with df1 = k - 1 and df2 = n - k
p_value <- pf(q = pseudo_F, df1 = NumClus - 1, df2 = NumObs - NumClus, lower.tail = FALSE)
# R-square = share of the total sum of squares explained by the clustering
Rsquare <- (km2$totss - km2$tot.withinss) / km2$totss
cat("\nK-means Clustering Overview for 2 Clusters")
##
## K-means Clustering Overview for 2 Clusters
cat("\nPseudo-F for 2 Clusters is: ", pseudo_F)
##
## Pseudo-F for 2 Clusters is: 9.186
cat("\nP-Value for pseudo-F for 2 Clusters is: ", p_value)
##
## P-Value for pseudo-F for 2 Clusters is: 0.02307
cat("\nR-square for 2 Clusters: ", Rsquare)
##
## R-square for 2 Clusters: 0.6049
# Calculate Pseudo-F of 3 Clusters
NumClus <- 3
NumObs <- nrow(df)
pseudo_F <- pseudoF(km3)
p_value <- pf(q = pseudo_F, df1 = NumClus - 1, df2 = NumObs - NumClus, lower.tail = FALSE)
Rsquare <- (km3$totss - km3$tot.withinss) / km3$totss
cat("\n\nK-means Clustering Overview for 3 Clusters")
##
##
## K-means Clustering Overview for 3 Clusters
cat("\nPseudo-F for 3 Clusters is: ", pseudo_F)
##
## Pseudo-F for 3 Clusters is: 11.12
cat("\nP-Value for pseudo-F for 3 Clusters is: ", p_value)
##
## P-Value for pseudo-F for 3 Clusters is: 0.01444
cat("\nR-square for 3 Clusters: ", Rsquare)
##
## R-square for 3 Clusters: 0.8164
cat("\n")
k-Means Clustering for
IRIS Data
Get Data
# Download Training Data from URL
# NOTE(review): the code that downloads the data and creates `IrisRaw` (raw
# values) and `Iris` (standardized copy) is missing from this chunk, although
# both objects are used throughout the sections below.
Explore Data for
Iris
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Iris)
## [1] "SepalLength" "SepalWidth" "PetalLength" "PetalWidth" "Species"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Iris)
## 'data.frame': 150 obs. of 5 variables:
## $ SepalLength: num -0.898 -1.139 -1.381 -1.501 -1.018 ...
## $ SepalWidth : num 1.0156 -0.1315 0.3273 0.0979 1.245 ...
## $ PetalLength: num -1.34 -1.34 -1.39 -1.28 -1.34 ...
## $ PetalWidth : num -1.31 -1.31 -1.31 -1.31 -1.31 ...
## $ Species : Factor w/ 3 levels "Setosa","Versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Iris, 5)
## SepalLength SepalWidth PetalLength PetalWidth Species
## 1 -0.8977 1.01560 -1.336 -1.311 Setosa
## 2 -1.1392 -0.13154 -1.336 -1.311 Setosa
## 3 -1.3807 0.32732 -1.392 -1.311 Setosa
## 4 -1.5015 0.09789 -1.279 -1.311 Setosa
## 5 -1.0184 1.24503 -1.336 -1.311 Setosa
cat("\n\nDescriptive Statistics of Columns in OriginalData Frame:\n")
##
##
## Descriptive Statistics of Columns in OriginalData Frame:
st(IrisRaw, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 SepalLength 150 5.843 0.8281 4.3 5.1 5.8 6.4 7.9 0
## 2 SepalWidth 150 3.057 0.4359 2 2.8 3 3.3 4.4 0
## 3 PetalLength 150 3.758 1.765 1 1.6 4.35 5.1 6.9 0
## 4 PetalWidth 150 1.199 0.7622 0.1 0.3 1.3 1.8 2.5 0
## 5 Species 150
## 6 ... Setosa 50 33.33%
## 7 ... Versicolor 50 33.33%
## 8 ... Virginica 50 33.33%
cat("\n\nDescriptive Statistics of Columns in Scaled Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Scaled Data Frame:
st(Iris, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75
## 1 SepalLength 150 -0.0000000000000004484 1 -1.864 -0.8977 -0.05233 0.6722
## 2 SepalWidth 150 0.0000000000000002034 1 -2.426 -0.5904 -0.1315 0.5567
## 3 PetalLength 150 -0.00000000000000002895 1 -1.562 -1.222 0.3354 0.7602
## 4 PetalWidth 150 -0.00000000000000003663 1 -1.442 -1.18 0.1321 0.788
## 5 Species 150
## 6 ... Setosa 50 33.33%
## 7 ... Versicolor 50 33.33%
## 8 ... Virginica 50 33.33%
## Max NA Mode
## 1 2.484 0
## 2 3.08 0
## 3 1.78 0
## 4 1.706 0
## 5
## 6
## 7
## 8
Plot Data
# Loading other packages if not available
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)
# Color selection - one colour per species
Colors <- c("orange",
"green",
"red")
# Show Characteristics of Data Frame
# FIX: the plot shows SepalLength vs SepalWidth, but the message and the axis
# labels said "Petal"; corrected to "Sepal". Also dropped `caption =`, which is
# not an argument of plot() (it only triggers a graphical-parameter warning).
cat("\n\nShow Sepal Length vs Sepal Width:\n")
##
##
## Show Sepal Length vs Sepal Width:
plot(Iris$SepalLength, Iris$SepalWidth, type = "p",
main = "Dot Plot of Iris Data",
sub = "Based on Public Domain Data",
col = Colors[factor(Iris$Species)],
pch = 19,
xlab = "Sepal Length",
ylab = "Sepal Width")
grid(nx = NULL, ny = NULL,
lty = 2, # Grid line type
col = "lightgrey", # Grid line color
lwd = 1) # Grid line width
legend("topleft", fill = Colors, legend = c("Setosa", "Versicolor", "Virginica"),
horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")

Estimating the
Optimal Number of Clusters
# Plot Graph of Cluster Number vs SSq
# (fviz_nbclust is from factoextra; it must be loaded in the session)
fviz_nbclust(Iris[, c(1:4)], kmeans, method = "wss", linecolor = "red"
) +
# geom_vline(xintercept = 4, size = 1, linetype = 2, linecolor = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
# FIX: removed `plot.x = element_text(size = 0)` - not a valid ggplot2 theme
# element; theme() errors on unknown elements.
) +
labs(title = "Number of Clusters and Respective Sum of Squares",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain",
col = Colors[factor(Iris$Species)], pch = 19,
x = "Number of Clusters",
y = "Total Within Sum of Squares"
)

Compute Cluster
Indicators
# Load Library if not available
# FIX: get_dist()/fviz_dist() come from factoextra, not stats - load factoextra
# (stats is a base package and is attached by default anyway).
if(! "factoextra" %in% installed.packages()) { install.packages("factoextra", dependencies = TRUE) }
library(factoextra)
# Pairwise Pearson-correlation-based distances between observations
dist.cor <- get_dist(Iris[, c(1:4)], method = "pearson")
# Show a Subset (first 8 x 8 distances, rounded to one decimal)
round(as.matrix(dist.cor)[1:8, 1:8], 1)
## 1 2 3 4 5 6 7 8
## 1 0 0 0 0.0 0 0.0 0 0
## 2 0 0 0 0.0 0 0.0 0 0
## 3 0 0 0 0.0 0 0.0 0 0
## 4 0 0 0 0.0 0 0.1 0 0
## 5 0 0 0 0.0 0 0.0 0 0
## 6 0 0 0 0.1 0 0.0 0 0
## 7 0 0 0 0.0 0 0.0 0 0
## 8 0 0 0 0.0 0 0.0 0 0
# Display Graph
fviz_dist(dist.cor)

Compute Cluster
Details for 5 Clusters
# Calculate Cluster Data
km5 <- kmeans(Iris[, c(1:4)], centers = 5, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
print(km5)
## K-means clustering with 5 clusters of sizes 28, 22, 23, 48, 29
##
## Cluster means:
## SepalLength SepalWidth PetalLength PetalWidth
## 1 -0.7467 1.4253 -1.2933 -1.21734
## 2 -1.3478 0.1187 -1.3100 -1.29316
## 3 -0.3516 -1.3286 0.1026 0.01228
## 4 0.3804 -0.3896 0.6068 0.56391
## 5 1.3927 0.2324 1.1567 1.21328
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 2 2 1 1 2 1 2 2 1 2 2 2 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 2 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 2 2 1 1 2 1 2 1 2 5 4 5 3 4 4 4 3 4 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 4 3 4 3 4 4 3 3 3 4 4 4 4 4 4 4 4 4 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 3 3 4 4 4 4 3 4 3 3 4 3 3 3 4 4 4 3 3
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 5 4 5 4 5 5 3 5 4 5 5 4 5 4 4 5 4 5 5 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 5 4 5 4 5 5 4 4 4 5 5 5 4 4 4 5 5 4 4 5
## 141 142 143 144 145 146 147 148 149 150
## 5 5 4 5 5 5 4 4 5 4
##
## Within cluster sum of squares by cluster:
## [1] 13.762 8.033 13.687 27.830 26.891
## (between_SS / total_SS = 84.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Show members of Clusters
# Create a data frame with the original (unscaled) data and the cluster assignments
clustered_data <- data.frame(IrisRaw[,1:4], Cluster = km5$cluster)
# Print members of each cluster
# FIX: seq_len() instead of 1:max(...) - `1:n` counts down (c(1, 0)) when n is 0
for (i in seq_len(max(clustered_data$Cluster))) {
cat("\nCluster", i, ":\n")
print(clustered_data[clustered_data$Cluster == i, ])
cat("\n")
}
##
## Cluster 1 :
## SepalLength SepalWidth PetalLength PetalWidth Cluster
## 1 5.1 3.5 1.4 0.2 1
## 5 5.0 3.6 1.4 0.2 1
## 6 5.4 3.9 1.7 0.4 1
## 8 5.0 3.4 1.5 0.2 1
## 11 5.4 3.7 1.5 0.2 1
## 15 5.8 4.0 1.2 0.2 1
## 16 5.7 4.4 1.5 0.4 1
## 17 5.4 3.9 1.3 0.4 1
## 18 5.1 3.5 1.4 0.3 1
## 19 5.7 3.8 1.7 0.3 1
## 20 5.1 3.8 1.5 0.3 1
## 21 5.4 3.4 1.7 0.2 1
## 22 5.1 3.7 1.5 0.4 1
## 23 4.6 3.6 1.0 0.2 1
## 27 5.0 3.4 1.6 0.4 1
## 28 5.2 3.5 1.5 0.2 1
## 29 5.2 3.4 1.4 0.2 1
## 32 5.4 3.4 1.5 0.4 1
## 33 5.2 4.1 1.5 0.1 1
## 34 5.5 4.2 1.4 0.2 1
## 37 5.5 3.5 1.3 0.2 1
## 38 4.9 3.6 1.4 0.1 1
## 40 5.1 3.4 1.5 0.2 1
## 41 5.0 3.5 1.3 0.3 1
## 44 5.0 3.5 1.6 0.6 1
## 45 5.1 3.8 1.9 0.4 1
## 47 5.1 3.8 1.6 0.2 1
## 49 5.3 3.7 1.5 0.2 1
##
##
## Cluster 2 :
## SepalLength SepalWidth PetalLength PetalWidth Cluster
## 2 4.9 3.0 1.4 0.2 2
## 3 4.7 3.2 1.3 0.2 2
## 4 4.6 3.1 1.5 0.2 2
## 7 4.6 3.4 1.4 0.3 2
## 9 4.4 2.9 1.4 0.2 2
## 10 4.9 3.1 1.5 0.1 2
## 12 4.8 3.4 1.6 0.2 2
## 13 4.8 3.0 1.4 0.1 2
## 14 4.3 3.0 1.1 0.1 2
## 24 5.1 3.3 1.7 0.5 2
## 25 4.8 3.4 1.9 0.2 2
## 26 5.0 3.0 1.6 0.2 2
## 30 4.7 3.2 1.6 0.2 2
## 31 4.8 3.1 1.6 0.2 2
## 35 4.9 3.1 1.5 0.2 2
## 36 5.0 3.2 1.2 0.2 2
## 39 4.4 3.0 1.3 0.2 2
## 42 4.5 2.3 1.3 0.3 2
## 43 4.4 3.2 1.3 0.2 2
## 46 4.8 3.0 1.4 0.3 2
## 48 4.6 3.2 1.4 0.2 2
## 50 5.0 3.3 1.4 0.2 2
##
##
## Cluster 3 :
## SepalLength SepalWidth PetalLength PetalWidth Cluster
## 54 5.5 2.3 4.0 1.3 3
## 58 4.9 2.4 3.3 1.0 3
## 60 5.2 2.7 3.9 1.4 3
## 61 5.0 2.0 3.5 1.0 3
## 63 6.0 2.2 4.0 1.0 3
## 65 5.6 2.9 3.6 1.3 3
## 68 5.8 2.7 4.1 1.0 3
## 69 6.2 2.2 4.5 1.5 3
## 70 5.6 2.5 3.9 1.1 3
## 80 5.7 2.6 3.5 1.0 3
## 81 5.5 2.4 3.8 1.1 3
## 82 5.5 2.4 3.7 1.0 3
## 83 5.8 2.7 3.9 1.2 3
## 88 6.3 2.3 4.4 1.3 3
## 90 5.5 2.5 4.0 1.3 3
## 91 5.5 2.6 4.4 1.2 3
## 93 5.8 2.6 4.0 1.2 3
## 94 5.0 2.3 3.3 1.0 3
## 95 5.6 2.7 4.2 1.3 3
## 99 5.1 2.5 3.0 1.1 3
## 100 5.7 2.8 4.1 1.3 3
## 107 4.9 2.5 4.5 1.7 3
## 120 6.0 2.2 5.0 1.5 3
##
##
## Cluster 4 :
## SepalLength SepalWidth PetalLength PetalWidth Cluster
## 52 6.4 3.2 4.5 1.5 4
## 55 6.5 2.8 4.6 1.5 4
## 56 5.7 2.8 4.5 1.3 4
## 57 6.3 3.3 4.7 1.6 4
## 59 6.6 2.9 4.6 1.3 4
## 62 5.9 3.0 4.2 1.5 4
## 64 6.1 2.9 4.7 1.4 4
## 66 6.7 3.1 4.4 1.4 4
## 67 5.6 3.0 4.5 1.5 4
## 71 5.9 3.2 4.8 1.8 4
## 72 6.1 2.8 4.0 1.3 4
## 73 6.3 2.5 4.9 1.5 4
## 74 6.1 2.8 4.7 1.2 4
## 75 6.4 2.9 4.3 1.3 4
## 76 6.6 3.0 4.4 1.4 4
## 77 6.8 2.8 4.8 1.4 4
## 78 6.7 3.0 5.0 1.7 4
## 79 6.0 2.9 4.5 1.5 4
## 84 6.0 2.7 5.1 1.6 4
## 85 5.4 3.0 4.5 1.5 4
## 86 6.0 3.4 4.5 1.6 4
## 87 6.7 3.1 4.7 1.5 4
## 89 5.6 3.0 4.1 1.3 4
## 92 6.1 3.0 4.6 1.4 4
## 96 5.7 3.0 4.2 1.2 4
## 97 5.7 2.9 4.2 1.3 4
## 98 6.2 2.9 4.3 1.3 4
## 102 5.8 2.7 5.1 1.9 4
## 104 6.3 2.9 5.6 1.8 4
## 109 6.7 2.5 5.8 1.8 4
## 112 6.4 2.7 5.3 1.9 4
## 114 5.7 2.5 5.0 2.0 4
## 115 5.8 2.8 5.1 2.4 4
## 117 6.5 3.0 5.5 1.8 4
## 122 5.6 2.8 4.9 2.0 4
## 124 6.3 2.7 4.9 1.8 4
## 127 6.2 2.8 4.8 1.8 4
## 128 6.1 3.0 4.9 1.8 4
## 129 6.4 2.8 5.6 2.1 4
## 133 6.4 2.8 5.6 2.2 4
## 134 6.3 2.8 5.1 1.5 4
## 135 6.1 2.6 5.6 1.4 4
## 138 6.4 3.1 5.5 1.8 4
## 139 6.0 3.0 4.8 1.8 4
## 143 5.8 2.7 5.1 1.9 4
## 147 6.3 2.5 5.0 1.9 4
## 148 6.5 3.0 5.2 2.0 4
## 150 5.9 3.0 5.1 1.8 4
##
##
## Cluster 5 :
## SepalLength SepalWidth PetalLength PetalWidth Cluster
## 51 7.0 3.2 4.7 1.4 5
## 53 6.9 3.1 4.9 1.5 5
## 101 6.3 3.3 6.0 2.5 5
## 103 7.1 3.0 5.9 2.1 5
## 105 6.5 3.0 5.8 2.2 5
## 106 7.6 3.0 6.6 2.1 5
## 108 7.3 2.9 6.3 1.8 5
## 110 7.2 3.6 6.1 2.5 5
## 111 6.5 3.2 5.1 2.0 5
## 113 6.8 3.0 5.5 2.1 5
## 116 6.4 3.2 5.3 2.3 5
## 118 7.7 3.8 6.7 2.2 5
## 119 7.7 2.6 6.9 2.3 5
## 121 6.9 3.2 5.7 2.3 5
## 123 7.7 2.8 6.7 2.0 5
## 125 6.7 3.3 5.7 2.1 5
## 126 7.2 3.2 6.0 1.8 5
## 130 7.2 3.0 5.8 1.6 5
## 131 7.4 2.8 6.1 1.9 5
## 132 7.9 3.8 6.4 2.0 5
## 136 7.7 3.0 6.1 2.3 5
## 137 6.3 3.4 5.6 2.4 5
## 140 6.9 3.1 5.4 2.1 5
## 141 6.7 3.1 5.6 2.4 5
## 142 6.9 3.1 5.1 2.3 5
## 144 6.8 3.2 5.9 2.3 5
## 145 6.7 3.3 5.7 2.5 5
## 146 6.7 3.0 5.2 2.3 5
## 149 6.2 3.4 5.4 2.3 5
# List Means with Original Data
aggregate(IrisRaw[,1:4], by = list(cluster = km5$cluster), mean)
## cluster SepalLength SepalWidth PetalLength PetalWidth
## 1 1 5.225 3.679 1.475 0.2714
## 2 2 4.727 3.109 1.445 0.2136
## 3 3 5.552 2.478 3.939 1.2087
## 4 4 6.158 2.888 4.829 1.6292
## 5 5 6.997 3.159 5.800 2.1241
Determining Optimal
Number of Clusters
Defining Number of
Clusters with Sum of Squares
# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)
# Elbow method: total within-cluster sum of squares vs number of clusters,
# with a dashed reference line at k = 4
fviz_nbclust(Iris[, c(1:4)], kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
# FIX: removed invalid `plot.x` theme element (theme() errors on it)
) +
labs(title = "Elbow Method",
subtitle = "Based on IRIS Data",
caption = "Data source: Public Domain",
col = Colors[factor(Iris$Species)], pch = 19,
x = "Number of Clusters",
y = "Total Within Sum of Squares"
)

Defining Number of
Clusters with Silhouette Analysis
# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)
# Silhouette Analysis: average silhouette width per candidate cluster count
fviz_nbclust(Iris[, 1:4], kmeans, method = "silhouette") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
# FIX: removed invalid `plot.x` theme element (theme() errors on it)
) +
labs(title = "Silhouette Method",
subtitle = "Silhouette method - IRIS Data",
caption = "Data source: Public Domain",
col = Colors[factor(Iris$Species)], pch = 19,
x = "Number of Clusters",
y = "Average Silhouette Width"
)

Defining Number of
Clusters with Gap Statistic
# Loading other packages if not available
if(! "cluster" %in% installed.packages()) { install.packages("cluster", dependencies = TRUE) }
library(cluster)
# Gap statistic with nboot = 50 bootstrap reference sets; seed fixed so the
# bootstrap (and hence the plot) is reproducible
set.seed(123)
fviz_nbclust(Iris[, 1:4], kmeans, nstart = 25, method = "gap_stat", nboot = 50)+
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
# FIX: removed invalid `plot.x` theme element (theme() errors on it)
) +
labs(title = "Gap Statistic Method",
subtitle = "Silhouette method - IRIS Data",
caption = "Data source: Public Domain",
col = Colors[factor(Iris$Species)], pch = 19,
x = "Number of Clusters",
y = "Gap Statistic (k)"
)

Determining the
Optimal Number of Clusters with NbClust
# NbClust Analysis
# FIX: NbClust() comes from the NbClust package, which was never loaded in this
# file - install/load it first.
if(! "NbClust" %in% installed.packages()) { install.packages("NbClust", dependencies = TRUE) }
library(NbClust)
# Evaluate k = 2..10 with ~30 validity indices; the majority vote picks the best k
nb <- NbClust(Iris[, 1:4], distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
Determining the
Optimal Number of Clusters out of 30 Indicators
# Using NbClust Analysis
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 11 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .

Build Clusters
# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)
# FIX: eclust()/fviz_cluster() are from factoextra - make sure it is loaded too
if(! "factoextra" %in% installed.packages()) { install.packages("factoextra", dependencies = TRUE) }
library(factoextra)
# K-means clustering on the four numeric columns, for k = 2, 3 and 4
km2 <- eclust(Iris[, c(1:4)], "kmeans", k = 2, nstart = 25, graph = FALSE)
km3 <- eclust(Iris[, c(1:4)], "kmeans", k = 3, nstart = 25, graph = FALSE)
km4 <- eclust(Iris[, c(1:4)], "kmeans", k = 4, nstart = 25, graph = FALSE)
# Visualize k-means clusters
# FIX: dropped the trailing commas after `repel = TRUE` and inside theme()
# (an empty trailing argument is an error in R) and removed the invalid
# `plot.x` theme element.
fviz_cluster(km2, data = Iris, geom = "point",
palette = c("#2E9FDF", "#FC4E07"),
ellipse.type = "norm", # Concentration ellipse (can be euclid)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Clusters Plot of Iris Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain")

# Visualize the 3-cluster solution
# FIX: dropped trailing commas (after `repel = TRUE` and inside theme()) and
# removed the invalid `plot.x` theme element - all three made the calls error.
fviz_cluster(km3, data = Iris, geom = "point",
palette = c("#2E9FDF", "#FC4E07", "green"),
ellipse.type = "norm", # Concentration ellipse (can be euclid)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Clusters Plot of Iris Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain")

# Visualize the 4-cluster solution
# FIX: dropped trailing commas (after `repel = TRUE` and inside theme()) and
# removed the invalid `plot.x` theme element - all three made the calls error.
fviz_cluster(km4, data = Iris, geom = "point",
palette = c("#2E9FDF", "#FC4E07", "green", "purple"),
ellipse.type = "norm", # Concentration ellipse (can be euclid)
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10)
) +
labs(title = "Clusters Plot of Iris Data",
subtitle = "Based on Public Domain Data",
caption = "Data source: Public Domain")

# Hierarchical clustering (Ward linkage on Euclidean distances)
# NOTE(review): the whole `Iris` frame - including the factor column Species -
# is passed in; distance computation expects numeric data, so verify whether
# Iris[, 1:4] was intended here (as used in the chunks above).
hc.res <- eclust(Iris, "hclust", k = 3, hc_metric = "euclidean", hc_method = "ward.D2", graph = FALSE)
hc.res
##
## Call:
## stats::hclust(d = x, method = hc_method)
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 150
# Visualize dendrograms
# fviz_dend(hc.res, show_labels = FALSE, palette = "jco", as.ggplot = TRUE)
Cluster Validation
with Silhouette
# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)
# Plot Cluster Silhouette (fviz_silhouette is from factoextra)
# FIX: removed the invalid `plot.x` theme element and the trailing comma inside
# theme() - both make the call error.
fviz_silhouette(km3, palette = "jco", ggtheme = theme_classic()) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
# panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
axis.text.x = element_text(color = "black", size = 12)
) +
scale_x_discrete(breaks = seq(10, 150, by = 20)) +
labs(title = "Clusters Silhouette Plot of Iris Data",
subtitle = "Based on Iris Data",
caption = "Data source: Public Domain")
## cluster size ave.sil.width
## 1 1 50 0.64
## 2 2 53 0.39
## 3 3 47 0.35

# Silhouette information
silinfo <- km3$silinfo
names(silinfo)
## [1] "widths" "clus.avg.widths" "avg.width"
# Silhouette widths of each observation
head(silinfo$widths[, 1:3], 10)
## cluster neighbor sil_width
## 1 1 2 0.7342
## 41 1 2 0.7333
## 8 1 2 0.7308
## 18 1 2 0.7288
## 5 1 2 0.7285
## 40 1 2 0.7247
## 38 1 2 0.7244
## 12 1 2 0.7218
## 28 1 2 0.7215
## 29 1 2 0.7145
# Average silhouette width of each cluster
silinfo$clus.avg.widths
## [1] 0.6363 0.3934 0.3474
# The total average (mean of all individual silhouette widths)
silinfo$avg.width
## [1] 0.4599
# The size of each cluster
km3$size
## [1] 50 53 47
# Per-observation silhouette detail: own cluster, nearest neighbour cluster, width
sil <- km3$silinfo$widths[, 1:3]
# FIX: typo in the printed message -- "Silhouette With" -> "Silhouette Width"
cat("\n\nShow Silhouette Width per Data Point. The higher the better:\n")
##
##
## Show Silhouette Width per Data Point. The higher the better:
sil
## cluster neighbor sil_width
## 1 1 2 0.73419
## 41 1 2 0.73333
## 8 1 2 0.73082
## 18 1 2 0.72875
## 5 1 2 0.72847
## 40 1 2 0.72470
## 38 1 2 0.72442
## 12 1 2 0.72179
## 28 1 2 0.72151
## 29 1 2 0.71452
## 50 1 2 0.71077
## 27 1 2 0.70471
## 25 1 2 0.70132
## 7 1 2 0.69838
## 23 1 2 0.69653
## 22 1 2 0.69021
## 49 1 2 0.68912
## 47 1 2 0.67882
## 3 1 2 0.67755
## 20 1 2 0.67746
## 36 1 2 0.67615
## 11 1 2 0.67418
## 30 1 2 0.66776
## 48 1 2 0.66606
## 37 1 2 0.66543
## 44 1 2 0.66372
## 21 1 2 0.66043
## 45 1 2 0.64864
## 32 1 2 0.64783
## 24 1 2 0.63738
## 43 1 2 0.63726
## 10 1 2 0.63154
## 35 1 2 0.62844
## 31 1 2 0.62562
## 17 1 2 0.62112
## 4 1 2 0.62050
## 6 1 2 0.60988
## 33 1 2 0.58967
## 19 1 2 0.58590
## 13 1 2 0.57847
## 2 1 2 0.56827
## 46 1 2 0.55935
## 15 1 2 0.55295
## 39 1 2 0.55262
## 14 1 2 0.54945
## 26 1 2 0.54495
## 34 1 2 0.54088
## 9 1 2 0.48821
## 16 1 2 0.45807
## 42 1 2 0.07767
## 90 2 3 0.58630
## 91 2 3 0.58417
## 95 2 3 0.58398
## 70 2 3 0.58259
## 93 2 3 0.58008
## 83 2 3 0.56859
## 81 2 3 0.56668
## 82 2 3 0.55698
## 80 2 3 0.55591
## 100 2 3 0.55292
## 68 2 3 0.54918
## 60 2 3 0.54390
## 54 2 3 0.54374
## 56 2 3 0.53879
## 97 2 3 0.50035
## 65 2 3 0.49518
## 107 2 3 0.47008
## 63 2 3 0.46947
## 72 2 3 0.45853
## 89 2 3 0.44045
## 94 2 1 0.43897
## 99 2 1 0.43311
## 96 2 3 0.43068
## 74 2 3 0.42610
## 61 2 1 0.42467
## 88 2 3 0.41830
## 120 2 3 0.41626
## 58 2 1 0.40978
## 85 2 3 0.40498
## 69 2 3 0.40351
## 67 2 3 0.39250
## 84 2 3 0.39024
## 114 2 3 0.37967
## 79 2 3 0.37448
## 73 2 3 0.36639
## 135 2 3 0.34505
## 102 2 3 0.34345
## 143 2 3 0.34345
## 62 2 3 0.34089
## 98 2 3 0.33949
## 64 2 3 0.33188
## 122 2 3 0.31781
## 92 2 3 0.24126
## 147 2 3 0.22992
## 75 2 3 0.21912
## 134 2 3 0.19544
## 124 2 3 0.17745
## 127 2 3 0.17430
## 55 2 3 0.13271
## 150 2 3 0.09788
## 139 2 3 0.08594
## 115 2 3 0.05779
## 59 2 3 0.03769
## 121 3 2 0.54976
## 144 3 2 0.54298
## 140 3 2 0.53818
## 125 3 2 0.52569
## 103 3 2 0.52208
## 126 3 2 0.51434
## 142 3 2 0.51143
## 141 3 2 0.50862
## 145 3 2 0.50445
## 113 3 2 0.50147
## 106 3 2 0.47219
## 136 3 2 0.46885
## 110 3 2 0.46638
## 146 3 2 0.46228
## 111 3 2 0.44616
## 116 3 2 0.44552
## 108 3 2 0.44361
## 105 3 2 0.43894
## 130 3 2 0.43638
## 101 3 2 0.42464
## 137 3 2 0.42243
## 131 3 2 0.41286
## 118 3 2 0.41132
## 123 3 2 0.40084
## 132 3 2 0.39874
## 149 3 2 0.38714
## 148 3 2 0.38121
## 138 3 2 0.36094
## 53 3 2 0.35855
## 117 3 2 0.35179
## 78 3 2 0.34222
## 51 3 2 0.34170
## 119 3 2 0.33087
## 87 3 2 0.27954
## 133 3 2 0.24284
## 57 3 2 0.23207
## 129 3 2 0.22254
## 66 3 2 0.18907
## 52 3 2 0.16628
## 104 3 2 0.15133
## 86 3 2 0.10973
## 76 3 2 0.05112
## 77 3 2 0.04064
## 71 3 2 0.03645
## 109 3 2 0.01676
## 112 3 2 -0.01058
## 128 3 2 -0.02489
# Objects with negative silhouette
# A negative width means the observation lies closer to its neighbouring
# cluster than to its own -- a candidate for reassignment.
neg_sil_index <- which(sil[, 'sil_width'] < 0)
# drop = FALSE keeps the matrix shape even if only one row matches
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 112 3 2 -0.01058
## 128 3 2 -0.02489
# Statistics for k-means clustering
# fpc::cluster.stats() computes a battery of internal validation indices from
# the Euclidean distance matrix and the k-means labels.
km_stats <- cluster.stats(dist(Iris), km3$cluster)
# Show Cluster Index
# The cat() calls below are a legend for the fields printed by km_stats.
cat("\n\nShow Overview of Cluster Characteristics:\n")
##
##
## Show Overview of Cluster Characteristics:
cat("- Cluster Sizes ($cluster.size): The number of data points in each cluster.\n")
## - Cluster Sizes ($cluster.size): The number of data points in each cluster.
cat("- Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.\n")
## - Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.
cat("- Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.\n")
## - Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.
cat("- Average Distances Between Clusters ($average.between): The average distance between points in different clusters.\n")
## - Average Distances Between Clusters ($average.between): The average distance between points in different clusters.
cat("- Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.\n")
## - Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.
cat("- Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.\n")
## - Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.
cat("- Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.\n")
## - Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.
cat("- Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.\n")
## - Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.
cat("- Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.\n\n")
## - Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
# Print the full statistics list
km_stats
## $n
## [1] 150
##
## $cluster.number
## [1] 3
##
## $cluster.size
## [1] 50 53 47
##
## $min.cluster.size
## [1] 47
##
## $noisen
## [1] 0
##
## $diameter
## [1] 5.628 3.267 3.738
##
## $average.distance
## [1] 1.314 1.338 1.462
##
## $median.distance
## [1] 1.105 1.292 1.385
##
## $separation
## [1] 1.7367 0.1491 0.1491
##
## $average.toother
## [1] 4.078 2.990 3.445
##
## $separation.matrix
## [,1] [,2] [,3]
## [1,] 0.000 1.7367 2.7001
## [2,] 1.737 0.0000 0.1491
## [3,] 2.700 0.1491 0.0000
##
## $ave.between.matrix
## [,1] [,2] [,3]
## [1,] 0.000 3.601 4.617
## [2,] 3.601 0.000 2.340
## [3,] 4.617 2.340 0.000
##
## $average.between
## [1] 3.5
##
## $average.within
## [1] 1.369
##
## $n.between
## [1] 7491
##
## $n.within
## [1] 3684
##
## $max.diameter
## [1] 5.628
##
## $min.separation
## [1] 0.1491
##
## $within.cluster.ss
## [1] 173.6
##
## $clus.avg.silwidths
## 1 2 3
## 0.6363 0.3934 0.3474
##
## $avg.silwidth
## [1] 0.4599
##
## $g2
## NULL
##
## $g3
## NULL
##
## $pearsongamma
## [1] 0.6797
##
## $dunn
## [1] 0.0265
##
## $dunn2
## [1] 1.6
##
## $entropy
## [1] 1.097
##
## $wb.ratio
## [1] 0.3911
##
## $ch
## [1] 241.9
##
## $cwidegap
## [1] 1.5532 0.8748 1.0546
##
## $widestgap
## [1] 1.553
##
## $sindex
## [1] 0.3941
##
## $corrected.rand
## NULL
##
## $vi
## NULL
# Show Cluster-Species-Relationship
# Cross-tabulation: rows = true species labels, columns = k-means cluster ids.
cat("\n\nShow Cluster-Category-Membership:\n")
##
##
## Show Cluster-Category-Membership:
table(Iris$Species, km3$cluster)
##
## 1 2 3
## Setosa 50 0 0
## Versicolor 0 39 11
## Virginica 0 14 36
Check Whether Clusters Are Valid Using R-Square and Pseudo-F
# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)
# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised.
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# fdm2id::KMEANS on the four numeric Iris columns; with k commented out and a
# criterion supplied, the function selects k by the pseudo-F criterion and
# plots criterion vs k -- assumption based on the fixed `k =` call at the
# USarrests section; TODO confirm against fdm2id docs.
KM2 <- KMEANS(
Iris[, c(1:4)],
# k = 2,
criterion = "pseudo-F",
graph = TRUE,
nstart = 10,
main = "Number of Clusters vs pseudo-F and R-Square"
)

# Calculate Pseudo-F of Clusters
#cat("\nK-means Clustering Overview:\n\n")
#pseudoF(KM2)
Check Agreement between Species and PAM/HC Clusters
# Load Library if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Agreement between species and pam clusters
# NOTE(review): k = 4 partitions data with only 3 species; confirm whether
# k = 3 was intended for a direct species comparison.
pam.res <- eclust(Iris[, c(1:4)], "pam", k = 4, graph = FALSE)
# FIX: the table previously tabulated km3$cluster, so it re-printed the
# k-means result instead of the PAM agreement promised by the comment above.
# Re-knit to refresh the printed table.
table(Iris$Species, pam.res$cluster)
# FIX (commented example): parenthesis was misplaced -- dist() took three
# arguments and $vi was applied to it. Correct form:
# cluster.stats(d = dist(Iris[, c(1:4)]), as.integer(Iris$Species), pam.res$cluster)$vi
K-Means Clustering for US Arrest Data
Get Data
# Download Training Data from URL
Explore Data for USarrests
# Loading other packages if not available (vtable supplies st() summary tables)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(USarrests)
## [1] "Murder" "Assault" "UrbanPop" "Rape"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(USarrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(USarrests, 5)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
# vtable::st() summary: N, mean, sd, quartiles, max, NA share and mode per column
st(USarrests, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 Murder 50 7.788 4.356 0.8 4.075 7.25 11.25 17.4 0
## 2 Assault 50 170.8 83.34 45 109 159 249 337 0
## 3 UrbanPop 50 65.54 14.47 32 54.5 66 77.75 91 0
## 4 Rape 50 21.23 9.366 7.3 15.08 20.1 26.17 46 0
Clean and Scale Data
# Drop any rows containing missing values before standardising
USarrests <- na.omit(USarrests)
# Standardise (z-score) every numeric column in place so all four crime
# variables contribute on a comparable scale to the distance computations
numeric_cols <- vapply(USarrests, is.numeric, logical(1))
USarrests[numeric_cols] <- scale(USarrests[numeric_cols])
# Persist the scaled table to the working directory
write.csv(USarrests, file = "USarrestsScaled.csv")
Estimating the Optimal Number of Clusters
# Plot Graph of Cluster Number vs SSq (elbow method on the scaled data)
# FIX: removed invalid theme element `plot.x` (ggplot2 has no such element;
# newer versions error on unknown theme settings), and corrected the
# commented geom_vline argument `linecolor` -> `color` (geom_vline takes
# `color`, as the working MallCust version of this plot already does).
fviz_nbclust(USarrests, kmeans, method = "wss", linecolor = "red") +
# geom_vline(xintercept = 4, size = 1, linetype = 2, color = "darkblue") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
) +
labs(title = "Number of Clusters and Respective Sum of Squares",
subtitle = "Based on Crime Data from USA",
caption = "Data source: Public Domain",
fill = "Components",
x = "Number of Clusters",
y = "Total Within Sum of Squares")

Compute Cluster Indicators
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Calculate Correlations
# factoextra::get_dist with method = "pearson" gives a correlation-based
# distance between observation profiles.
dist.cor <- get_dist(USarrests, method = "pearson")
# Show a Subset (first 8 states, rounded to one decimal)
round(as.matrix(dist.cor)[1:8, 1:8], 1)
## Alabama Alaska Arizona Arkansas California Colorado Connecticut
## Alabama 0.0 0.7 1.4 0.1 1.9 1.7 1.7
## Alaska 0.7 0.0 0.8 0.4 0.8 0.5 1.9
## Arizona 1.4 0.8 0.0 1.2 0.3 0.6 0.8
## Arkansas 0.1 0.4 1.2 0.0 1.6 1.4 1.9
## California 1.9 0.8 0.3 1.6 0.0 0.1 0.7
## Colorado 1.7 0.5 0.6 1.4 0.1 0.0 1.0
## Connecticut 1.7 1.9 0.8 1.9 0.7 1.0 0.0
## Delaware 1.1 1.5 0.3 1.2 0.9 1.4 0.5
## Delaware
## Alabama 1.1
## Alaska 1.5
## Arizona 0.3
## Arkansas 1.2
## California 0.9
## Colorado 1.4
## Connecticut 0.5
## Delaware 0.0
# Display Graph
# Heatmap of the correlation-based distance matrix computed above
fviz_dist(dist.cor)

Compute Cluster Details for 4 Clusters
# Calculate Cluster Data
# k-means with 4 centres; nstart = 25 random restarts guards against poor
# local optima.
km4 <- kmeans(USarrests, centers = 4, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
print(km4)
## K-means clustering with 4 clusters of sizes 16, 8, 13, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.4894 -0.3826 0.5758 -0.26165
## 2 1.4119 0.8743 -0.8145 0.01927
## 3 -0.9615 -1.1066 -0.9301 -0.96676
## 4 0.6951 1.0394 0.7226 1.27694
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 4 4 2 4
## Colorado Connecticut Delaware Florida Georgia
## 4 1 1 4 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 3 4 1 3
## Kansas Kentucky Louisiana Maine Maryland
## 1 3 2 3 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 4 3 2 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 4 3 1
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 2 3 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 3 2 4 1 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 16.212 8.316 11.952 19.922
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Show members of Clusters
# Create a data frame with the original data and the cluster assignments
# NOTE(review): USarrestsRaw is defined outside this excerpt -- presumably the
# unscaled copy kept before scaling; confirm it exists at this point.
clustered_data <- data.frame(USarrestsRaw, Cluster = km4$cluster)
# Print members of each cluster
# FIX: seq_len() replaces the 1:max(...) pattern (idiomatic, avoids the
# c(1, 0) trap of `1:n` when n could ever be 0)
for (i in seq_len(max(clustered_data$Cluster))) {
cat("\nCluster", i, ":\n")
print(clustered_data[clustered_data$Cluster == i, ])
cat("\n")
}
##
## Cluster 1 :
## Murder Assault UrbanPop Rape Cluster
## Connecticut 3.3 110 77 11.1 1
## Delaware 5.9 238 72 15.8 1
## Hawaii 5.3 46 83 20.2 1
## Indiana 7.2 113 65 21.0 1
## Kansas 6.0 115 66 18.0 1
## Massachusetts 4.4 149 85 16.3 1
## New Jersey 7.4 159 89 18.8 1
## Ohio 7.3 120 75 21.4 1
## Oklahoma 6.6 151 68 20.0 1
## Oregon 4.9 159 67 29.3 1
## Pennsylvania 6.3 106 72 14.9 1
## Rhode Island 3.4 174 87 8.3 1
## Utah 3.2 120 80 22.9 1
## Virginia 8.5 156 63 20.7 1
## Washington 4.0 145 73 26.2 1
## Wyoming 6.8 161 60 15.6 1
##
##
## Cluster 2 :
## Murder Assault UrbanPop Rape Cluster
## Alabama 13.2 236 58 21.2 2
## Arkansas 8.8 190 50 19.5 2
## Georgia 17.4 211 60 25.8 2
## Louisiana 15.4 249 66 22.2 2
## Mississippi 16.1 259 44 17.1 2
## North Carolina 13.0 337 45 16.1 2
## South Carolina 14.4 279 48 22.5 2
## Tennessee 13.2 188 59 26.9 2
##
##
## Cluster 3 :
## Murder Assault UrbanPop Rape Cluster
## Idaho 2.6 120 54 14.2 3
## Iowa 2.2 56 57 11.3 3
## Kentucky 9.7 109 52 16.3 3
## Maine 2.1 83 51 7.8 3
## Minnesota 2.7 72 66 14.9 3
## Montana 6.0 109 53 16.4 3
## Nebraska 4.3 102 62 16.5 3
## New Hampshire 2.1 57 56 9.5 3
## North Dakota 0.8 45 44 7.3 3
## South Dakota 3.8 86 45 12.8 3
## Vermont 2.2 48 32 11.2 3
## West Virginia 5.7 81 39 9.3 3
## Wisconsin 2.6 53 66 10.8 3
##
##
## Cluster 4 :
## Murder Assault UrbanPop Rape Cluster
## Alaska 10.0 263 48 44.5 4
## Arizona 8.1 294 80 31.0 4
## California 9.0 276 91 40.6 4
## Colorado 7.9 204 78 38.7 4
## Florida 15.4 335 80 31.9 4
## Illinois 10.4 249 83 24.0 4
## Maryland 11.3 300 67 27.8 4
## Michigan 12.1 255 74 35.1 4
## Missouri 9.0 178 70 28.2 4
## Nevada 12.2 252 81 46.0 4
## New Mexico 11.4 285 70 32.1 4
## New York 11.1 254 86 26.1 4
## Texas 12.7 201 80 25.5 4
# List Means with Original Data
# Per-cluster column means on the original (unscaled) measurement scale.
# NOTE(review): USarrestsRaw is defined outside this excerpt -- confirm.
aggregate(USarrestsRaw, by = list(cluster = km4$cluster), mean)
## cluster Murder Assault UrbanPop Rape
## 1 1 5.656 138.88 73.88 18.78
## 2 2 13.938 243.62 53.75 21.41
## 3 3 3.600 78.54 52.08 12.18
## 4 4 10.815 257.38 76.00 33.19
Perform k-Means Clustering
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# output to be present as PNG file
# png(file = "KMeansCluster.png")
# Generate k-Means Cluster (k = 4, 25 random restarts)
km4 <- kmeans(USarrests, centers = 4, nstart = 25)
# Visualize the clusters
fviz_cluster(km4, data = USarrests)

# saving the file
# dev.off()
# output to be present as PNG file
# png(file = "KMeansCluster2.png")
# Generate k-Means Cluster (k = 5, for comparison)
km5 <- kmeans(USarrests, centers = 5, nstart = 25)
# Visualize the clusters
fviz_cluster(km5, data = USarrests)

# saving the file
# dev.off()
# output to be present as PNG file
# png(file = "KMeansCluster3.png")
# Generate k-Means Cluster (k = 6, for comparison)
km6 <- kmeans(USarrests, centers = 6, nstart = 25)
# Visualize the clusters
fviz_cluster(km6, data = USarrests)

# Visualize the k = 4 clusters with custom palette and styling
# FIX: removed invalid theme element `plot.x` -- ggplot2 defines no such
# element and recent versions error on unknown theme settings.
fviz_cluster(km4, data = USarrests,
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "euclid", # Concentration ellipse
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
) +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
) +
labs(title = "Clusters Plot of Crime Data",
subtitle = "Based on Crime Data from USA",
caption = "Data source: Public Domain")

# saving the file
# dev.off()
Check Whether Clusters Are Valid
# Load Library if not available
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)
# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised.
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# NOTE(review): here k is fixed at 8 while the Iris section left k unset and
# let the criterion choose -- confirm k = 8 is intentional for this data.
KM4 <- KMEANS(
USarrests,
k = 8,
criterion = "pseudo-F",
graph = TRUE,
nstart = 10
)

# Calculate Pseudo-F of Clusters
pseudoF(KM4)
## [1] 43.46
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
Cluster Validation with Silhouette
# Loading other packages if not available
if(! "fpc" %in% installed.packages()) { install.packages("fpc", dependencies = TRUE) }
library(fpc)
# Generate k-Means Cluster
# FIX: stats::kmeans() returns no $silinfo component, so every silhouette
# lookup below printed NULL (visible in the previous output). Use
# factoextra::eclust(), as the Iris section does: it runs kmeans internally
# and attaches the silhouette information. Re-knit to refresh the outputs.
km3 <- eclust(USarrests, "kmeans", k = 3, nstart = 25, graph = FALSE)
# Silhouette information
silinfo <- km3$silinfo
names(silinfo)
# Silhouette widths of each observation
head(silinfo$widths[, 1:3], 10)
# Average silhouette width of each cluster
silinfo$clus.avg.widths
# The total average (mean of all individual silhouette widths)
silinfo$avg.width
# The size of each cluster
km3$size
# Silhouette width of each observation
# FIX: typo in the printed message -- "Silhouette With" -> "Silhouette Width"
cat("\n\nShow Silhouette Width per Data Point. The higher the better:\n")
sil <- km3$silinfo$widths[, 1:3]
sil
# Objects with negative silhouette (closer to a neighbouring cluster than their own)
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
# Statistics for k-means clustering
# fpc::cluster.stats() computes internal validation indices from the distance
# matrix of the scaled data and the k = 3 labels.
km_stats <- cluster.stats(dist(USarrests), km3$cluster)
# Show Cluster Index
# The cat() calls below are a legend for the fields printed by km_stats.
cat("\n\nShow Overview of Cluster Characteristics:\n")
##
##
## Show Overview of Cluster Characteristics:
cat("- Cluster Sizes ($cluster.size): The number of data points in each cluster.\n")
## - Cluster Sizes ($cluster.size): The number of data points in each cluster.
cat("- Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.\n")
## - Cluster Diameter ($diameter): The maximum distance between any two points within a cluster.
cat("- Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.\n")
## - Average Distances Within Clusters ($average.distance): The average distance between points within the same cluster.
cat("- Average Distances Between Clusters ($average.between): The average distance between points in different clusters.\n")
## - Average Distances Between Clusters ($average.between): The average distance between points in different clusters.
cat("- Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.\n")
## - Cluster Separation ($separation.matrix): Measures how distinct or well-separated the clusters are from each other.
cat("- Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.\n")
## - Biggest Within Cluster Gap ($widestgap, $cwidegap): The largest distance between any two points within the same cluster.
cat("- Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.\n")
## - Average Silhouette Widths ($clus.avg.silwidths): A measure of how similar an object is to its own cluster compared to other clusters. Higher values indicate better clustering.
cat("- Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.\n")
## - Dunn Index ($dunn): The ratio of the minimum inter-cluster distance to the maximum intra-cluster distance. Higher values indicate better clustering.
cat("- Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.\n\n")
## - Corrected Rand Index ($corrected.rand): Measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
# Print the full statistics list
km_stats
## $n
## [1] 50
##
## $cluster.number
## [1] 3
##
## $cluster.size
## [1] 17 20 13
##
## $min.cluster.size
## [1] 13
##
## $noisen
## [1] 0
##
## $diameter
## [1] 3.088 4.420 2.448
##
## $average.distance
## [1] 1.470 2.067 1.314
##
## $median.distance
## [1] 1.441 1.937 1.269
##
## $separation
## [1] 0.5279 0.9787 0.5279
##
## $average.toother
## [1] 2.599 3.322 3.142
##
## $separation.matrix
## [,1] [,2] [,3]
## [1,] 0.0000 0.9787 0.5279
## [2,] 0.9787 0.0000 1.7490
## [3,] 0.5279 1.7490 0.0000
##
## $ave.between.matrix
## [,1] [,2] [,3]
## [1,] 0.000 2.853 2.207
## [2,] 2.853 0.000 3.936
## [3,] 2.207 3.936 0.000
##
## $average.between
## [1] 3.022
##
## $average.within
## [1] 1.668
##
## $n.between
## [1] 821
##
## $n.within
## [1] 404
##
## $max.diameter
## [1] 4.42
##
## $min.separation
## [1] 0.5279
##
## $within.cluster.ss
## [1] 78.32
##
## $clus.avg.silwidths
## 1 2 3
## 0.3179 0.2606 0.3735
##
## $avg.silwidth
## [1] 0.3094
##
## $g2
## NULL
##
## $g3
## NULL
##
## $pearsongamma
## [1] 0.5416
##
## $dunn
## [1] 0.1194
##
## $dunn2
## [1] 1.068
##
## $entropy
## [1] 1.084
##
## $wb.ratio
## [1] 0.5521
##
## $ch
## [1] 35.31
##
## $cwidegap
## [1] 1.1803 2.0581 0.9825
##
## $widestgap
## [1] 2.058
##
## $sindex
## [1] 0.7068
##
## $corrected.rand
## NULL
##
## $vi
## NULL
K-Means Clustering for Mall Customers
Get Data
# Download Training Data from URL
Explore Data for Mall Customers
# Loading other packages if not available (vtable for st(), ggplot2 for plots)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(MallCust)
## [1] "Gender" "Age" "AnnualIncome" "SpendingScore"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(MallCust)
## 'data.frame': 200 obs. of 4 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ AnnualIncome : int 15 15 16 16 17 17 18 18 19 19 ...
## $ SpendingScore: int 39 81 6 77 40 76 6 94 3 72 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(MallCust, 5)
## Gender Age AnnualIncome SpendingScore
## 1 Male 19 15 39
## 2 Male 21 15 81
## 3 Female 20 16 6
## 4 Female 23 16 77
## 5 Female 31 17 40
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
# vtable::st() summary: N, mean, sd, quartiles, max, NA share and mode per column
st(MallCust, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 Gender 200
## 2 ... Female 112 56%
## 3 ... Male 88 44%
## 4 Age 200 38.85 13.97 18 28.75 36 49 70 0
## 5 AnnualIncome 200 60.56 26.26 15 41.5 61.5 78 137 0
## 6 SpendingScore 200 50.2 25.82 1 34.75 50 73 99 0
# Plot Income vs Spending Score by Gender
# FIX: removed invalid theme element `plot.x` -- ggplot2 defines no such
# element and recent versions raise an error for unknown theme settings.
ggplot(MallCust, aes(x = AnnualIncome, y = SpendingScore, color = Gender)) +
geom_point() +
labs(title = "Income vs Spending Score by Gender",
subtitle = "Based on Public Domain Data ",
caption = "Data source: Public Domain") +
theme(panel.grid.major = element_line(color = "grey", size = 0.5),
panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
plot.title = element_text(hjust = 0.5, size = 22),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 12),
legend.title = element_text(color = "black", size = 14),
legend.text = element_text(color = "black", size = 12),
axis.text.x = element_text(face="bold", color="#993333", size = 14, angle = 0),
axis.text.y = element_text(face="bold", color="#993333", size = 14, angle = 0),
axis.title = element_text(size = 14, colour = "black"),
)

Clean and Scale Data
# Omitting any NA values
# FIX: the follow-up complete.cases() pass was removed -- na.omit() already
# drops every row containing NA (or NaN), so the second filter was a no-op.
# (Its comment also claimed Inf removal, which complete.cases() does not do.)
MallCust <- na.omit(MallCust)
# Convert Gender column to numeric: Male -> 1, Female -> 0
MallCust$Gender <- ifelse(MallCust$Gender == "Male", 1, 0)
# Scale Data (z-score standardisation of every numeric column; after the
# Gender recoding that is all four columns)
numeric_cols <- sapply(MallCust, is.numeric)
MallCust[, numeric_cols] <- scale(MallCust[, numeric_cols])
# Write Data to WD
write.csv(MallCust, file = "MallCustScaled.csv")
Estimating the Optimal Number of Clusters
# Elbow plot: total within-cluster sum of squares per candidate k on the
# scaled mall-customer data, with a dashed marker at the chosen k = 5.
elbow_plot <- fviz_nbclust(MallCust, kmeans, method = "wss", linecolor = "red") +
  geom_vline(xintercept = 5, size = 1, linetype = 2, color = "darkblue") +
  labs(
    title = "Number of Clusters and Respective Sum of Squares",
    subtitle = "Based on Mall Customer Data",
    caption = "Data source: Public Domain",
    x = "Number of Clusters",
    y = "Total Within Sum of Squares"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title.x = element_text(size = 12),
    panel.grid.major = element_line(color = "grey", size = 0.5),
    panel.grid.minor = element_line(color = "lightgrey", size = 0.25)
  )
elbow_plot

Compute Cluster Indicators
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Calculate Correlations
# factoextra::get_dist with method = "pearson" gives a correlation-based
# distance between customer profiles.
dist.cor <- get_dist(MallCust, method = "pearson")
# Show a Subset (first 8 customers, rounded to one decimal)
round(as.matrix(dist.cor)[1:8, 1:8], 1)
## 1 2 3 4 5 6 7 8
## 1 0.0 0.1 0.2 0.7 0.7 0.7 0.9 0.7
## 2 0.1 0.0 0.6 0.2 0.4 0.2 1.2 0.3
## 3 0.2 0.6 0.0 1.3 0.9 1.3 0.4 1.3
## 4 0.7 0.2 1.3 0.0 0.3 0.0 1.4 0.0
## 5 0.7 0.4 0.9 0.3 0.0 0.3 0.7 0.3
## 6 0.7 0.2 1.3 0.0 0.3 0.0 1.4 0.0
## 7 0.9 1.2 0.4 1.4 0.7 1.4 0.0 1.5
## 8 0.7 0.3 1.3 0.0 0.3 0.0 1.5 0.0
# Display Graph
# Heatmap of the correlation-based distance matrix computed above
fviz_dist(dist.cor)

Compute Cluster Details for 4 Clusters
# Calculate Cluster Data
# k-means with 4 centres on the scaled customer data; nstart = 25 restarts
# guard against poor local optima.
km4 <- kmeans(MallCust, centers = 4, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
print(km4)
## K-means clustering with 4 clusters of sizes 57, 55, 48, 40
##
## Cluster means:
## Gender Age AnnualIncome SpendingScore
## 1 -0.8842 -0.7453 -0.03401 0.6771
## 2 -0.8842 0.6628 -0.06632 -0.5971
## 3 1.1253 0.7579 0.07069 -0.8129
## 4 1.1253 -0.7588 0.05483 0.8316
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 4 4 2 1 1 1 2 1 3 1 3 1 2 1 3 4 2 4 3 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 4 2 4 2 4 2 4 2 1 3 1 3 4 2 1 2 1 2 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 4 3 1 2 1 2 1 1 1 2 4 1 3 2 3 2 3 1 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 4 2 2 3 4 2 2 4 1 3 2 2 2 3 4 2 3 1 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 4 3 2 1 3 2 1 1 2 2 4 3 2 1 4 2 1 3 4
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 3 4 3 1 2 3 3 3 3 1 2 4 1 1 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 4 2 1 4 1 1 3 4 3 4 3 4 1 1 3 1 2 4 3 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 4 1 1 3 4 3 1 2 4 3 4 2 1 2 1 3 1 3 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 1 3 1 3 1 3 1 2 4 3 4 3 4 2 1 3 4 3 4
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 1 3 1 2 4 2 4 2 1 2 1 3 1 2 1 2 4 3 4
##
## Within cluster sum of squares by cluster:
## [1] 94.92 100.72 115.12 74.02
## (between_SS / total_SS = 51.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Show members of Clusters
# Create a data frame with the original (unscaled) data and the cluster assignments
clustered_data <- data.frame(MallCustRaw, Cluster = km4$cluster)
# Print members of each cluster
# seq_len() instead of 1:max(...): safe even if the cluster vector were empty
for (i in seq_len(max(clustered_data$Cluster))) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
##
## Cluster 1 :
## Gender Age AnnualIncome SpendingScore Cluster
## 4 Female 23 16 77 1
## 5 Female 31 17 40 1
## 6 Female 22 17 76 1
## 8 Female 23 18 94 1
## 10 Female 30 19 72 1
## 12 Female 35 19 99 1
## 14 Female 24 20 77 1
## 20 Female 35 23 98 1
## 30 Female 23 29 87 1
## 32 Female 21 30 73 1
## 36 Female 21 33 81 1
## 38 Female 30 34 73 1
## 40 Female 20 37 75 1
## 44 Female 31 39 61 1
## 46 Female 24 39 65 1
## 48 Female 27 40 47 1
## 49 Female 29 40 42 1
## 50 Female 31 40 42 1
## 53 Female 31 43 54 1
## 59 Female 27 46 51 1
## 70 Female 32 48 47 1
## 79 Female 23 54 52 1
## 85 Female 21 54 57 1
## 88 Female 22 57 55 1
## 89 Female 34 58 60 1
## 95 Female 32 60 42 1
## 98 Female 27 60 50 1
## 101 Female 23 62 41 1
## 106 Female 21 62 42 1
## 112 Female 19 63 54 1
## 115 Female 18 65 48 1
## 116 Female 19 65 50 1
## 123 Female 40 69 58 1
## 125 Female 23 70 29 1
## 126 Female 31 70 77 1
## 133 Female 25 72 34 1
## 134 Female 31 72 71 1
## 136 Female 29 73 88 1
## 140 Female 35 74 72 1
## 143 Female 28 76 40 1
## 144 Female 32 76 87 1
## 148 Female 32 77 74 1
## 154 Female 38 78 76 1
## 156 Female 27 78 89 1
## 158 Female 30 78 78 1
## 160 Female 30 78 73 1
## 162 Female 29 79 83 1
## 164 Female 31 81 93 1
## 166 Female 36 85 75 1
## 168 Female 33 86 95 1
## 176 Female 30 88 86 1
## 182 Female 32 97 86 1
## 184 Female 29 98 88 1
## 190 Female 36 103 85 1
## 192 Female 32 103 69 1
## 194 Female 38 113 91 1
## 196 Female 35 120 79 1
##
##
## Cluster 2 :
## Gender Age AnnualIncome SpendingScore Cluster
## 3 Female 20 16 6 2
## 7 Female 35 18 6 2
## 13 Female 58 20 15 2
## 17 Female 35 21 35 2
## 23 Female 46 25 5 2
## 25 Female 54 28 14 2
## 27 Female 45 28 32 2
## 29 Female 40 29 31 2
## 35 Female 49 33 14 2
## 37 Female 42 34 17 2
## 39 Female 36 37 26 2
## 41 Female 65 38 35 2
## 45 Female 49 39 28 2
## 47 Female 50 40 55 2
## 51 Female 49 42 52 2
## 55 Female 50 43 45 2
## 57 Female 51 44 50 2
## 63 Female 67 47 52 2
## 64 Female 54 47 59 2
## 67 Female 43 48 50 2
## 68 Female 68 48 48 2
## 72 Female 47 49 42 2
## 73 Female 60 50 49 2
## 74 Female 60 50 56 2
## 77 Female 45 54 53 2
## 80 Female 49 54 42 2
## 84 Female 46 54 44 2
## 87 Female 55 57 58 2
## 90 Female 50 58 46 2
## 91 Female 68 59 55 2
## 94 Female 40 60 40 2
## 97 Female 47 60 47 2
## 102 Female 49 62 48 2
## 107 Female 66 63 50 2
## 113 Female 38 64 42 2
## 117 Female 63 65 43 2
## 118 Female 49 65 59 2
## 119 Female 51 67 43 2
## 120 Female 50 67 57 2
## 122 Female 38 67 40 2
## 137 Female 44 73 7 2
## 141 Female 57 75 5 2
## 149 Female 34 78 22 2
## 153 Female 44 78 20 2
## 155 Female 47 78 16 2
## 161 Female 56 79 35 2
## 169 Female 36 87 27 2
## 175 Female 52 88 13 2
## 181 Female 37 97 32 2
## 185 Female 41 99 39 2
## 187 Female 54 101 24 2
## 189 Female 41 103 17 2
## 191 Female 34 103 23 2
## 195 Female 47 120 16 2
## 197 Female 45 126 28 2
##
##
## Cluster 3 :
## Gender Age AnnualIncome SpendingScore Cluster
## 9 Male 64 19 3 3
## 11 Male 67 19 14 3
## 15 Male 37 20 13 3
## 19 Male 52 23 29 3
## 21 Male 35 24 35 3
## 31 Male 60 30 4 3
## 33 Male 53 33 4 3
## 43 Male 48 39 36 3
## 54 Male 59 43 60 3
## 56 Male 47 43 41 3
## 58 Male 69 44 46 3
## 60 Male 53 46 46 3
## 61 Male 70 46 56 3
## 65 Male 63 48 51 3
## 71 Male 70 49 55 3
## 75 Male 59 54 47 3
## 78 Male 40 54 48 3
## 81 Male 57 54 51 3
## 83 Male 67 54 41 3
## 86 Male 48 54 46 3
## 93 Male 48 60 49 3
## 99 Male 48 61 42 3
## 103 Male 67 62 59 3
## 105 Male 49 62 56 3
## 108 Male 54 63 46 3
## 109 Male 68 63 43 3
## 110 Male 66 63 48 3
## 111 Male 65 63 52 3
## 127 Male 43 71 35 3
## 129 Male 59 71 11 3
## 131 Male 47 71 9 3
## 135 Male 20 73 5 3
## 139 Male 19 74 10 3
## 145 Male 25 77 12 3
## 147 Male 48 77 36 3
## 151 Male 43 78 17 3
## 157 Male 37 78 1 3
## 159 Male 34 78 1 3
## 163 Male 19 81 5 3
## 165 Male 50 85 26 3
## 167 Male 42 86 20 3
## 171 Male 40 87 13 3
## 173 Male 36 87 10 3
## 177 Male 58 88 15 3
## 179 Male 59 93 14 3
## 183 Male 46 98 15 3
## 193 Male 33 113 8 3
## 199 Male 32 137 18 3
##
##
## Cluster 4 :
## Gender Age AnnualIncome SpendingScore Cluster
## 1 Male 19 15 39 4
## 2 Male 21 15 81 4
## 16 Male 22 20 79 4
## 18 Male 20 21 66 4
## 22 Male 25 24 73 4
## 24 Male 31 25 73 4
## 26 Male 29 28 82 4
## 28 Male 35 28 61 4
## 34 Male 18 33 92 4
## 42 Male 24 38 92 4
## 52 Male 33 42 60 4
## 62 Male 19 46 55 4
## 66 Male 18 48 59 4
## 69 Male 19 48 59 4
## 76 Male 26 54 54 4
## 82 Male 38 54 55 4
## 92 Male 18 59 41 4
## 96 Male 24 60 52 4
## 100 Male 20 61 49 4
## 104 Male 26 62 55 4
## 114 Male 19 64 46 4
## 121 Male 27 67 56 4
## 124 Male 39 69 91 4
## 128 Male 40 71 95 4
## 130 Male 38 71 75 4
## 132 Male 39 71 75 4
## 138 Male 32 73 73 4
## 142 Male 32 75 93 4
## 146 Male 28 77 97 4
## 150 Male 34 78 90 4
## 152 Male 39 78 88 4
## 170 Male 32 87 63 4
## 172 Male 28 87 75 4
## 174 Male 36 87 92 4
## 178 Male 27 88 69 4
## 180 Male 35 93 90 4
## 186 Male 30 99 97 4
## 188 Male 28 101 68 4
## 198 Male 32 126 74 4
## 200 Male 30 137 83 4
# List Means with Original Data
# Gender is non-numeric in MallCustRaw, so mean() returns NA for that column
aggregate(MallCustRaw, by = list(cluster = km4$cluster), mean)
## cluster Gender Age AnnualIncome SpendingScore
## 1 1 NA 28.44 59.67 67.68
## 2 2 NA 48.11 58.82 34.78
## 3 3 NA 49.44 62.42 29.21
## 4 4 NA 28.25 62.00 71.67
Compute Cluster Details for 5 Clusters
# Calculate Cluster Data
# centers = 5 clusters; nstart = 25 random initialisations, best solution kept
km5 <- kmeans(MallCust, centers = 5, nstart = 25)
cat("\nK-means Clustering Overview:\n\n")
##
## K-means Clustering Overview:
print(km5)
## K-means clustering with 5 clusters of sizes 40, 56, 42, 31, 31
##
## Cluster means:
## Gender Age AnnualIncome SpendingScore
## 1 1.1253 -0.7588 0.054826 0.8316
## 2 -0.8842 -0.7486 -0.005004 0.6962
## 3 -0.8842 0.7368 -0.541664 -0.4097
## 4 1.1253 1.2208 -0.448731 -0.4412
## 5 0.2178 0.1123 1.120895 -1.3344
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 3 2 3 2 3 2 4 2 4 2 3 2 4 1 3 1 4 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 4 1 3 1 3 1 3 1 3 2 4 2 4 1 3 2 3 2 3 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 1 4 2 3 2 3 2 2 2 3 1 2 4 3 4 3 4 2 4
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 4 1 3 3 4 1 3 3 1 2 4 3 3 3 4 1 3 4 2 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 4 1 4 3 2 4 3 2 2 3 3 1 4 3 2 1 3 2 4 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 3 4 1 4 2 3 4 4 4 4 2 3 1 2 2 3 3 3 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 3 2 1 2 2 4 1 4 1 5 1 2 2 5 2 5 1 5 2
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 5 1 2 2 5 1 4 2 5 1 5 1 5 2 5 2 5 2 5 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 3 2 5 2 5 2 5 2 5 1 5 1 5 1 5 2 5 1 5 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 5 2 5 2 5 1 5 1 5 2 5 2 5 2 5 2 5 1 5 1
##
## Within cluster sum of squares by cluster:
## [1] 74.02 91.03 53.56 40.64 64.21
## (between_SS / total_SS = 59.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Show members of Clusters
# Create a data frame with the original (unscaled) data and the cluster assignments
clustered_data <- data.frame(MallCustRaw, Cluster = km5$cluster)
# Print members of each cluster
# seq_len() instead of 1:max(...): safe even if the cluster vector were empty
for (i in seq_len(max(clustered_data$Cluster))) {
  cat("\nCluster", i, ":\n")
  print(clustered_data[clustered_data$Cluster == i, ])
  cat("\n")
}
##
## Cluster 1 :
## Gender Age AnnualIncome SpendingScore Cluster
## 1 Male 19 15 39 1
## 2 Male 21 15 81 1
## 16 Male 22 20 79 1
## 18 Male 20 21 66 1
## 22 Male 25 24 73 1
## 24 Male 31 25 73 1
## 26 Male 29 28 82 1
## 28 Male 35 28 61 1
## 34 Male 18 33 92 1
## 42 Male 24 38 92 1
## 52 Male 33 42 60 1
## 62 Male 19 46 55 1
## 66 Male 18 48 59 1
## 69 Male 19 48 59 1
## 76 Male 26 54 54 1
## 82 Male 38 54 55 1
## 92 Male 18 59 41 1
## 96 Male 24 60 52 1
## 100 Male 20 61 49 1
## 104 Male 26 62 55 1
## 114 Male 19 64 46 1
## 121 Male 27 67 56 1
## 124 Male 39 69 91 1
## 128 Male 40 71 95 1
## 130 Male 38 71 75 1
## 132 Male 39 71 75 1
## 138 Male 32 73 73 1
## 142 Male 32 75 93 1
## 146 Male 28 77 97 1
## 150 Male 34 78 90 1
## 152 Male 39 78 88 1
## 170 Male 32 87 63 1
## 172 Male 28 87 75 1
## 174 Male 36 87 92 1
## 178 Male 27 88 69 1
## 180 Male 35 93 90 1
## 186 Male 30 99 97 1
## 188 Male 28 101 68 1
## 198 Male 32 126 74 1
## 200 Male 30 137 83 1
##
##
## Cluster 2 :
## Gender Age AnnualIncome SpendingScore Cluster
## 4 Female 23 16 77 2
## 6 Female 22 17 76 2
## 8 Female 23 18 94 2
## 10 Female 30 19 72 2
## 12 Female 35 19 99 2
## 14 Female 24 20 77 2
## 20 Female 35 23 98 2
## 30 Female 23 29 87 2
## 32 Female 21 30 73 2
## 36 Female 21 33 81 2
## 38 Female 30 34 73 2
## 40 Female 20 37 75 2
## 44 Female 31 39 61 2
## 46 Female 24 39 65 2
## 48 Female 27 40 47 2
## 49 Female 29 40 42 2
## 50 Female 31 40 42 2
## 53 Female 31 43 54 2
## 59 Female 27 46 51 2
## 70 Female 32 48 47 2
## 79 Female 23 54 52 2
## 85 Female 21 54 57 2
## 88 Female 22 57 55 2
## 89 Female 34 58 60 2
## 95 Female 32 60 42 2
## 98 Female 27 60 50 2
## 101 Female 23 62 41 2
## 106 Female 21 62 42 2
## 112 Female 19 63 54 2
## 115 Female 18 65 48 2
## 116 Female 19 65 50 2
## 123 Female 40 69 58 2
## 125 Female 23 70 29 2
## 126 Female 31 70 77 2
## 133 Female 25 72 34 2
## 134 Female 31 72 71 2
## 136 Female 29 73 88 2
## 140 Female 35 74 72 2
## 143 Female 28 76 40 2
## 144 Female 32 76 87 2
## 148 Female 32 77 74 2
## 154 Female 38 78 76 2
## 156 Female 27 78 89 2
## 158 Female 30 78 78 2
## 160 Female 30 78 73 2
## 162 Female 29 79 83 2
## 164 Female 31 81 93 2
## 166 Female 36 85 75 2
## 168 Female 33 86 95 2
## 176 Female 30 88 86 2
## 182 Female 32 97 86 2
## 184 Female 29 98 88 2
## 190 Female 36 103 85 2
## 192 Female 32 103 69 2
## 194 Female 38 113 91 2
## 196 Female 35 120 79 2
##
##
## Cluster 3 :
## Gender Age AnnualIncome SpendingScore Cluster
## 3 Female 20 16 6 3
## 5 Female 31 17 40 3
## 7 Female 35 18 6 3
## 13 Female 58 20 15 3
## 17 Female 35 21 35 3
## 23 Female 46 25 5 3
## 25 Female 54 28 14 3
## 27 Female 45 28 32 3
## 29 Female 40 29 31 3
## 35 Female 49 33 14 3
## 37 Female 42 34 17 3
## 39 Female 36 37 26 3
## 41 Female 65 38 35 3
## 45 Female 49 39 28 3
## 47 Female 50 40 55 3
## 51 Female 49 42 52 3
## 55 Female 50 43 45 3
## 57 Female 51 44 50 3
## 63 Female 67 47 52 3
## 64 Female 54 47 59 3
## 67 Female 43 48 50 3
## 68 Female 68 48 48 3
## 72 Female 47 49 42 3
## 73 Female 60 50 49 3
## 74 Female 60 50 56 3
## 77 Female 45 54 53 3
## 80 Female 49 54 42 3
## 84 Female 46 54 44 3
## 87 Female 55 57 58 3
## 90 Female 50 58 46 3
## 91 Female 68 59 55 3
## 94 Female 40 60 40 3
## 97 Female 47 60 47 3
## 102 Female 49 62 48 3
## 107 Female 66 63 50 3
## 113 Female 38 64 42 3
## 117 Female 63 65 43 3
## 118 Female 49 65 59 3
## 119 Female 51 67 43 3
## 120 Female 50 67 57 3
## 122 Female 38 67 40 3
## 161 Female 56 79 35 3
##
##
## Cluster 4 :
## Gender Age AnnualIncome SpendingScore Cluster
## 9 Male 64 19 3 4
## 11 Male 67 19 14 4
## 15 Male 37 20 13 4
## 19 Male 52 23 29 4
## 21 Male 35 24 35 4
## 31 Male 60 30 4 4
## 33 Male 53 33 4 4
## 43 Male 48 39 36 4
## 54 Male 59 43 60 4
## 56 Male 47 43 41 4
## 58 Male 69 44 46 4
## 60 Male 53 46 46 4
## 61 Male 70 46 56 4
## 65 Male 63 48 51 4
## 71 Male 70 49 55 4
## 75 Male 59 54 47 4
## 78 Male 40 54 48 4
## 81 Male 57 54 51 4
## 83 Male 67 54 41 4
## 86 Male 48 54 46 4
## 93 Male 48 60 49 4
## 99 Male 48 61 42 4
## 103 Male 67 62 59 4
## 105 Male 49 62 56 4
## 108 Male 54 63 46 4
## 109 Male 68 63 43 4
## 110 Male 66 63 48 4
## 111 Male 65 63 52 4
## 127 Male 43 71 35 4
## 129 Male 59 71 11 4
## 147 Male 48 77 36 4
##
##
## Cluster 5 :
## Gender Age AnnualIncome SpendingScore Cluster
## 131 Male 47 71 9 5
## 135 Male 20 73 5 5
## 137 Female 44 73 7 5
## 139 Male 19 74 10 5
## 141 Female 57 75 5 5
## 145 Male 25 77 12 5
## 149 Female 34 78 22 5
## 151 Male 43 78 17 5
## 153 Female 44 78 20 5
## 155 Female 47 78 16 5
## 157 Male 37 78 1 5
## 159 Male 34 78 1 5
## 163 Male 19 81 5 5
## 165 Male 50 85 26 5
## 167 Male 42 86 20 5
## 169 Female 36 87 27 5
## 171 Male 40 87 13 5
## 173 Male 36 87 10 5
## 175 Female 52 88 13 5
## 177 Male 58 88 15 5
## 179 Male 59 93 14 5
## 181 Female 37 97 32 5
## 183 Male 46 98 15 5
## 185 Female 41 99 39 5
## 187 Female 54 101 24 5
## 189 Female 41 103 17 5
## 191 Female 34 103 23 5
## 193 Male 33 113 8 5
## 195 Female 47 120 16 5
## 197 Female 45 126 28 5
## 199 Male 32 137 18 5
# List Means with Scaled Data
# (this aggregates MallCust, the standardised data used for clustering,
# not the raw data - see the z-score-scale values in the output below)
aggregate(MallCust, by = list(cluster = km5$cluster), mean)
## cluster Gender Age AnnualIncome SpendingScore
## 1 1 1.1253 -0.7588 0.054826 0.8316
## 2 2 -0.8842 -0.7486 -0.005004 0.6962
## 3 3 -0.8842 0.7368 -0.541664 -0.4097
## 4 4 1.1253 1.2208 -0.448731 -0.4412
## 5 5 0.2178 0.1123 1.120895 -1.3344
# List means with the original (unscaled) data; Gender is non-numeric, hence NA
aggregate(MallCustRaw, by = list(cluster = km5$cluster), mean)
## cluster Gender Age AnnualIncome SpendingScore
## 1 1 NA 28.25 62.00 71.67
## 2 2 NA 28.39 60.43 68.18
## 3 3 NA 49.14 46.33 39.62
## 4 4 NA 55.90 48.77 38.81
## 5 5 NA 40.42 90.00 15.74
Perform k-Means Clustering with k = 4
# stats is a base package bundled with every R installation, so the usual
# install.packages() guard can never fire and is omitted here
library(stats)
# Generate k-Means Cluster
# NOTE(review): km4 is recomputed here; with nstart = 25 the partition should be
# stable, but cluster labels may be permuted relative to the earlier run - confirm
km4 <- kmeans(MallCust, centers = 4, nstart = 25)
# Visualize the clusters (default factoextra cluster plot)
fviz_cluster(km4, data = MallCust)

# output to be present as PNG file
png(file = "KMeansCluster4.png", width = 1000, height = 1000)
# Visualize the clusters
# (trailing commas after the last argument removed: they create empty
# arguments and only work by accident of R's argument matching)
fviz_cluster(km4, data = MallCust,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
             ellipse.type = "euclid", # Concentration ellipse
             star.plot = TRUE,        # Add segments from centroids to items
             repel = TRUE             # Avoid label overplotting (slow)
) +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10)
        # "plot.x" removed: it is not a valid ggplot2 theme element
  ) +
  labs(title = "Clusters Plot of Mall Customer Data",
       subtitle = "Based on Public Domain Data",
       caption = "Data source: Public Domain")
# Add Cluster Number to MallCust
MallCust$Cluster4 <- km4$cluster
MallCustRaw$Cluster4 <- km4$cluster
# saving the file (close the PNG device)
dev.off()
## png
## 2
# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
# Write Data to WD
# (paste() inserts a space between the date prefix and the base file name)
write.csv(MallCust, file = paste(today, "MallCustScaledClust4.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw4.csv"))
Perform k-Means Clustering with k = 5
# stats is a base package bundled with every R installation, so the usual
# install.packages() guard can never fire and is omitted here
library(stats)
# Visualize the clusters and keep the plot object for saving with ggsave()
p <- fviz_cluster(km5, data = MallCust,
                  palette = c("#2E9FDF", "#E7B800", "#FC4E07", "green", "brown"),
                  ellipse.type = "euclid", # Concentration ellipse
                  star.plot = TRUE,        # Add segments from centroids to items
                  repel = TRUE             # Avoid label overplotting (slow)
) +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 22),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 12),
        # "plot.x" removed: it is not a valid ggplot2 theme element
        legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 12),
        axis.text.x = element_text(face = "bold", color = "#993333", size = 14, angle = 0),
        axis.text.y = element_text(face = "bold", color = "#993333", size = 14, angle = 0),
        axis.title.x = element_text(size = 14, colour = "black"),
        axis.title.y = element_text(size = 14, colour = "black"),
        legend.position = "top"
  ) +
  labs(title = "Mall Customer Segments",
       subtitle = "Clustered by Age, Gender, Annual Income and Spending",
       caption = "Data source: Public Domain"
  )
p

# Save the plot (paste0 instead of paste(..., sep = ""))
ggsave(filename = paste0("Mall Customer Segments", ".png"), plot = p, width = 12, height = 9)
# Add Cluster Number to MallCust
MallCust$Segment <- km5$cluster
MallCustRaw$Segment <- km5$cluster
# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
# Write Data to WD
write.csv(MallCust, file = paste(today, "MallCustScaledClust5.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw5.csv"))
Understand Customer Segments
# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
# Describe customer Segments: per-segment averages plus a gender count table
segment_profiles <- MallCustRaw %>%
  group_by(Segment) %>%
  summarise(
    Avg_Age = mean(Age),
    Avg_Income = mean(AnnualIncome),
    Avg_Spending_Score = mean(SpendingScore),
    Gender_Distribution = list(table(Gender))
  )
# Visualise Segments
# NOTE(review): Segment is numeric here; wrapping it in factor() would give
# discrete axis ticks - left unchanged to preserve the original plot
p <- ggplot(MallCustRaw, aes(x = Segment, fill = Gender)) +
  geom_bar(position = "dodge") +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 22),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 12),
        # "plot.x" removed: it is not a valid ggplot2 theme element
        legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 12),
        axis.text.x = element_text(face = "bold", color = "#993333", size = 14, angle = 0),
        axis.text.y = element_text(face = "bold", color = "#993333", size = 14, angle = 0),
        axis.title.x = element_text(size = 14, colour = "black"),
        axis.title.y = element_text(size = 14, colour = "black")
  ) +
  labs(title = "Customer Segments by Gender",
       subtitle = "Based on Public Domain Data ",
       caption = "Data source: Public Domain",
       x = "Customer Segment",
       y = "Customers",
       fill = "Gender")  # the fill aesthetic is Gender, so label the legend "Gender"
p

# Save the plot (paste0 instead of paste(..., sep = ""))
ggsave(filename = paste0("Customer Segments by Gender", ".png"), plot = p, width = 12, height = 8)
# Write Data to WD
# write.csv(segment_profiles, file = paste(today, "SegmentProfiles5.csv"))
Perform k-Means Clustering with k = 6
# stats is a base package bundled with every R installation, so the usual
# install.packages() guard can never fire and is omitted here
library(stats)
# Generate k-Means Cluster
km6 <- kmeans(MallCust, centers = 6, nstart = 25)
# Visualize the clusters (default factoextra cluster plot)
fviz_cluster(km6, data = MallCust)

# Visualize the clusters (customised plot)
fviz_cluster(km6, data = MallCust,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "green", "brown"),
             ellipse.type = "euclid", # Concentration ellipse
             star.plot = TRUE,        # Add segments from centroids to items
             repel = TRUE             # Avoid label overplotting (slow)
) +
  theme(panel.grid.major = element_line(color = "grey", size = 0.5),
        panel.grid.minor = element_line(color = "lightgrey", size = 0.25),
        plot.title = element_text(hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        plot.caption = element_text(hjust = 0, face = "italic", size = 10)
        # "plot.x" removed: it is not a valid ggplot2 theme element
  ) +
  labs(title = "Clusters Plot of Mall Customer Data",
       subtitle = "Based on Public Domain Data ",
       caption = "Data source: Public Domain")

# Add Cluster Number to MallCust
MallCust$Cluster6 <- km6$cluster
MallCustRaw$Cluster6 <- km6$cluster
# Set Working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
# Write Data to WD
write.csv(MallCust, file = paste(today, "MallCustScaledClust6.csv"))
write.csv(MallCustRaw, file = paste(today, "MallCustRaw6.csv"))
Check Whether Clusters Are Valid
# Load Library if not available (fdm2id provides KMEANS() with validity criteria)
if(! "fdm2id" %in% installed.packages()) { install.packages("fdm2id", dependencies = TRUE) }
library(fdm2id)
# Run pseudo-F and R-square for Clustering
cat("Pseudo-F and R-square should both be maximised. \nThe pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.\n\n")
## Pseudo-F and R-square should both be maximised.
## The pseudo-F statistic is a measure of cluster validity, and higher values indicate better-defined clusters. It compares the variance between clusters to the variance within clusters, so a higher pseudo-F value suggests that the clusters are well-separated and compact.
cat("The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.\n\n")
## The R-square value represents the proportion of the total variance in the data that is explained by the clustering. A higher R-square value indicates that the clusters are capturing more of the variance in the data, which generally means better clustering performance.
# NOTE(review): the result object is named KM4 but k = 5 clusters are requested -
# confirm whether the name or the k value is the intended one
KM4 <- KMEANS(
MallCust,
k = 5,
criterion = "pseudo-F",
graph = TRUE,
nstart = 10
)
