1 Load Necessary Libraries and Make Some Settings


# Loading necessary packages
## Markdown Update
# Install (if necessary) and attach every package this report needs.
# The original repeated the same install/library pair 14 times; a loop
# over a named vector is equivalent and far easier to maintain.
# NOTE: `pkg %in% rownames(installed.packages())` is the documented way to
# test for an installed package (the matrix's rownames are package names).
required_packages <- c(
  "rmarkdown",   # report rendering
  "readxl",      # Excel import
  "tidytext",    # tokenisation / tidy text analysis
  "tidyr",       # data reshaping
  "textclean",   # text normalisation helpers
  "tm",          # classic text mining (removePunctuation, stopwords, ...)
  "dplyr",       # data manipulation
  "ggplot2",     # plotting
  "lubridate",   # date handling (floor_date)
  "topicmodels", # LDA topic modelling
  "stringr",     # string helpers
  "kableExtra",  # styled HTML tables
  "vtable",      # descriptive summary tables
  "DT"           # interactive data tables
)
for (pkg in required_packages) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}


# Global Settings
options(digits =   4)   # print numbers with 4 significant digits
options(scipen = 999)   # strongly penalise scientific notation in printed output
# NOTE(review): a hard-coded absolute path makes this script non-portable;
# consider an RStudio project or here::here() instead of setwd().
setwd("C:/Users/uwe/OneDrive/Documents/AD Data/TextMining/R")

# Make date string: today's date in Singapore time as "yymmdd", used as a
# prefix for every output file written by this script.
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

2 Get Data and Add ID

Purpose: To ensure all data is ready for analysis without missing or incompatible entries. Explanation: This phase involves gathering and loading the data into R for processing. The dataset, presumably containing text data like customer comments or feedback, was imported into the R environment using read.csv(). Any encoding issues were handled to ensure compatibility. Data is retrieved from URL. Original ID number is replaced by new ID number in Date sequence. Amended data file is saved.


# Get Your Text Data and Save Them in TextData

# Drop the original ID column (column 2), sort chronologically by enquiry
# date, and number the rows so the new ID follows the date sequence.
TextData <- TextDataRaw[, -2]
date_order <- order(TextData$EnqDate)
TextData <- TextData[date_order, ]
TextData$ID <- seq_len(nrow(TextData))

# Interactive preview of the first dozen rows
datatable(head(TextData, 12), options = list(pageLength = 12))

# Persist raw and amended data to the working directory, prefixed with
# today's date stamp (paste()'s default sep puts a space in the filename).
write.csv(TextDataRaw, file = paste(today, "TextDataRaw.csv"))
write.csv(TextData, file = paste(today, "TextData.csv"))

3 Prepare Data

3.1 Clean Data and add Month Column

Purpose: To prepare the text data for analysis by removing noise and ensuring that only meaningful content is retained. Explanation: Non-ASCII characters, punctuation, and numbers were removed. All text was converted to lowercase to ensure consistency. Encoding of text data in UTF-8 ensures compatibility with all tools. The date column is converted into R date format, and a month column is created.


# Strip boilerplate "delivery date changed" phrases from the complaint text
# so they do not dominate the token counts later.

# (kept for reference) remove rows where old and new delivery dates are identical
# TextData <- TextData[!grepl("Old Delivery Date: (.+?) TO \\1 (AM|PM).*New Delivery Date : \\1 TO \\1 (AM|PM)", TextData$Complaint), ]

# Remove the whole "Old Delivery Date: ... New Delivery Date : ..." span.
# The lazy .*? pieces keep the match as short as possible; note the regex
# requires BOTH phrases to appear, in that order, in one complaint.
TextData$Complaint <- gsub(
  "Old Delivery Date:.*?TO.*?(AM|PM|NT|[A-Z]+).*?New Delivery Date :.*?TO.*?(AM|PM|NT|[A-Z]+)",
  "",
  TextData$Complaint,
  ignore.case = TRUE
)

# Remove "msg".
# NOTE(review): this pattern has no word boundaries, so it deletes "msg"
# anywhere it occurs inside a longer word as well — confirm that is intended.
TextData$Complaint <- gsub("msg", "", TextData$Complaint, ignore.case = TRUE)

# (kept for reference) catch phrases where only one of the two markers appears
#TextData$Complaint <- gsub("(?i)Old Delivery Date:.*?$|New Delivery Date :.*?$", "", TextData$Complaint, perl = TRUE)

# Trim any leading or trailing whitespace caused by the removals above
TextData$Complaint <- trimws(TextData$Complaint)

# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)

# Re-encode the complaint text as UTF-8; sub = "" silently drops any byte
# sequence that cannot be converted.
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")

# (kept for reference) column-wise invalid-character removal
# TextData <- TextData %>% mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = ""))  # Removes invalid characters

# Strip every non-ASCII character (anything outside \x01-\x7F). This is
# stricter than "multibyte": accented letters, emoji etc. are all removed.
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)

# Drop rows with missing complaints, lowercase the text, and title-case the
# comment Type so categories compare consistently later.
TextData <- TextData %>%
  filter(!is.na(Complaint)) %>%
  mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
  mutate(Type = str_to_title(Type)) # Convert to titlecase

# Ensure the enquiry date is a proper Date (no-op if it already is)
TextData$EnqDate <- as.Date(TextData$EnqDate)

# Month column: first day of each enquiry's month, used for stratification
TextData <- TextData %>%
  mutate(Month = floor_date(EnqDate, "month"))

# Generate the descriptive summary of the cleaned data frame
df_summary <- dfSummary(TextData)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary
TextData
Dimensions: 23801 x 9
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Business [character] 1. Division A 2. Division B 16908 (71.0%) 6893 (29.0%) IIIIIIIIIIIIII IIIII 23801 (100.0%) 0 (0.0%)
2 EnqDate [Date] min : 2094-07-03 med : 2095-01-30 max : 2095-07-02 range : 11m 29d 314 distinct values : : : . : : . . . : : : : : : . : : : : : : : : : : 23801 (100.0%) 0 (0.0%)
3 Type [character] 1. Cancel 2. Changes 3. Credit Card 4. Delivery Status 5. Feedback 6. Others 7. Pdo 8. Ssid Change 1618 ( 6.8%) 9318 (39.1%) 26 ( 0.1%) 993 ( 4.2%) 3419 (14.4%) 5732 (24.1%) 2681 (11.3%) 14 ( 0.1%) I IIIIIII 23801 (100.0%) 0 (0.0%)
4 Order [character] 1. (Empty string) 2. - 3. 0 4. 3189452 5. 3189100 6. 3217360 7. 5159916 8. 5160062 9. 5172125 10. 3217115 [ 18620 others ] 795 ( 3.3%) 138 ( 0.6%) 77 ( 0.3%) 38 ( 0.2%) 25 ( 0.1%) 25 ( 0.1%) 24 ( 0.1%) 18 ( 0.1%) 17 ( 0.1%) 15 ( 0.1%) 22629 (95.1%) 23801 (100.0%) 0 (0.0%)
5 Complaint [character] 1. info is protected 2. pdo - not in 3. pdo - shifted 4. pdo - shifted. 5. pdo- not in 6. pdo - name_4e6f74 in 7. pdo - closed 8. pdo- name_4e6f74 in 9. pdo - no such person 10. pls update delivery statu [ 15843 others ] 5758 (24.2%) 296 ( 1.2%) 154 ( 0.6%) 75 ( 0.3%) 72 ( 0.3%) 61 ( 0.3%) 58 ( 0.2%) 52 ( 0.2%) 42 ( 0.2%) 40 ( 0.2%) 17193 (72.2%) IIII 23801 (100.0%) 0 (0.0%)
6 Department [character] All NA’s 0 (0.0%) 23801 (100.0%)
7 SubmittedBy [character] 1. USER_54494e4720485549 2. USER_77616e207169 3. USER_46656c69636961 4. USER_4c69687569 5. USER_46454c4958 6. USER_4a6f616e6e6520546861 7. USER_4a6f6e617468616e 8. USER_4873696e67 9. USER_4a61736d696e65204875 10. USER_53554d495441 [ 95 others ] 2245 ( 9.4%) 1865 ( 7.8%) 1443 ( 6.1%) 1290 ( 5.4%) 1239 ( 5.2%) 1055 ( 4.4%) 982 ( 4.1%) 954 ( 4.0%) 819 ( 3.4%) 560 ( 2.4%) 11349 (47.7%) I I I I I 23801 (100.0%) 0 (0.0%)
8 ID [integer] Mean (sd) : 11901 (6871) min < med < max: 1 < 11901 < 23801 IQR (CV) : 11900 (0.6) 23801 distinct values (Integer sequence) : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 23801 (100.0%) 0 (0.0%)
9 Month [Date] min : 2094-07-01 med : 2095-01-01 max : 2095-07-01 range : 1y 0m 0d 13 distinct values : : : . . : . : . : : : : : : : : 23801 (100.0%) 0 (0.0%)

# Show Characteristics of Data Frame
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

3.2 Preprocess Data

Punctuation, numbers, stopwords (common words like ‘the’, ‘and’) and names are removed. All comments are split into initial words, i.e., tokens. Any comments marked as “no information” or with NA values were excluded.


# Text cleaning: punctuation, numbers and stopwords are removed in layers.
# NOTE(review): several steps repeat earlier work (the text was already
# lowercased and ASCII-only by section 3.1, and punctuation is replaced by
# spaces one line before removePunctuation runs) — harmless, but redundant.
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint)
TextData$Complaint <- removePunctuation(TextData$Complaint)
TextData$Complaint <- removeNumbers(TextData$Complaint)

# Remove rows whose whole complaint is exactly "no information"
TextData <- TextData[TextData$Complaint != "no information", ]

# Normalise the in-house abbreviation "pdo" to "order" (whole words only)
TextData$Complaint <- gsub("\\bpdo\\b", "order", TextData$Complaint)

# Load an extended English stopword list.
# NOTE(review): this needs network access at knit time; consider caching
# the file locally so the report still renders offline.
EnglishStopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")
EnglishStopwords <- c(EnglishStopwords,'pte', 'ltd')
EnglishStopwordsPattern <- paste0("\\b(", paste(EnglishStopwords, collapse = "|"), ")\\b")

# Remove the extended stopwords from the Complaint column
TextData <- TextData %>%
  mutate(Complaint = str_remove_all(Complaint, EnglishStopwordsPattern))

# Custom stopwords: tm's English list plus known personal names
CustomStopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
CustomStopwordsPattern <- paste0("\\b(", paste(CustomStopwords, collapse = "|"), ")\\b")

# Remove the custom stopwords from the Complaint column
TextData <- TextData %>%
  mutate(Complaint = str_remove_all(Complaint, CustomStopwordsPattern))

# Patterns for rows to drop entirely: complaints containing a date, and
# complaints containing a capitalised name.
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern to match names

# NOTE(review): by this point Complaint has been lowercased AND numbers
# removed, so neither pattern can ever match — both filters below are
# effectively dead code. Confirm whether they should run on the raw text.
TextData <- TextData %>%
    filter(Complaint != "no information") %>%
    filter(Complaint != "old") %>%
    filter(!str_detect(Complaint, date_pattern)) %>%
    filter(!str_detect(Complaint, name_pattern)) %>%
    filter(!str_detect(Type, "Credit Card")) %>%                        # Remove Type Credit Card
    mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))

# Word frequency table across the whole corpus (stopwords already removed)
TextDatafreq <- TextData %>%
    unnest_tokens(word, Complaint) %>%
    count(word, sort = TRUE)

# Collapse any remaining runs of whitespace left by the removals
TextData <- TextData %>%
  mutate(Complaint = str_squish(Complaint))

# Persist the cleaned data and the word lists, date-stamped
write.csv(TextData, file = paste(today, "TextDataClean.csv"))
write.csv(TextDatafreq, file = paste(today, "TextDatafreq.csv"))
write.csv(EnglishStopwords, file = paste(today, "EnglishStop.csv"))
write.csv(CustomStopwords, file = paste(today, "CustomStop.csv"))

3.3 Tokenise Data

Purpose: To enable word-level analysis, which is crucial for frequency analysis, sentiment analysis, and topic modeling. Explanation: Text data was split into individual words (tokens) using unnest_tokens(). This step transforms unstructured text into structured data, making it easier to analyse.


# Loading other packages if not available
if(! "textstem" %in% installed.packages()) { install.packages("textstem", dependencies = TRUE) }
library(textstem)

# Tokenise each complaint into one row per word, drop tidytext's standard
# English stop words (explicit by = "word" silences the join message), and
# guard against NA tokens.
tokens <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!is.na(word))

# Lemmatise tokens; keep the pre-lemmatisation table for reference
tokensOld <- tokens
tokens$word <- lemmatize_words(tokens$word)

# Re-attach the stratification columns from TextData. The join duplicates
# Business/Month/Type with .x/.y suffixes, so drop the .y copies and
# restore the plain names.
tokens <- tokens %>%
  inner_join(TextData %>% select(ID, Business, Month, Type), by = "ID") %>%
  select(-Business.y, -Month.y, -Type.y) %>%
  rename(
        Business = Business.x,
        Month    = Month.x,
        Type     = Type.x
    )

# Headline counts for the report text. nrow() is the direct row count;
# the original's as.numeric(count(df)) relied on coercing a 1x1 tibble.
NumRawComments <- format(nrow(TextDataRaw), big.mark = ",", scientific = FALSE)
NumComments    <- format(nrow(TextData),    big.mark = ",", scientific = FALSE)
NumTokens      <- format(nrow(tokens),      big.mark = ",", scientific = FALSE)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")

cat("\nThis dataset includes:\n")

This dataset includes:

cat("- Number of raw comments:", NumRawComments,"\n")
  • Number of raw comments: 23,801
cat("- Number of clean comments:", NumComments,"\n")
  • Number of clean comments: 23,775
cat("- Number of tokens:", NumTokens,"\n")
  • Number of tokens: 154,948
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
  • Comments between: 03 Jul 2094 to 02 Jul 2095

# Generate the descriptive summary of the token table
df_summary <- dfSummary(tokens)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary
tokens
Dimensions: 154948 x 9
Duplicates: 16067

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Business [character] 1. Division A 2. Division B 108023 (69.7%) 46925 (30.3%) IIIIIIIIIIIII IIIIII 154948 (100.0%) 0 (0.0%)
2 EnqDate [Date] min : 2094-07-03 med : 2095-01-29 max : 2095-07-02 range : 11m 29d 314 distinct values : : . . : : . . . . : : : : : : . : : : : : : : : : : 154948 (100.0%) 0 (0.0%)
3 Type [character] 1. Cancel 2. Changes 3. Delivery Status 4. Feedback 5. Others 6. Pdo 7. Ssid Change 10501 ( 6.8%) 70664 (45.6%) 7997 ( 5.2%) 46100 (29.8%) 11464 ( 7.4%) 8152 ( 5.3%) 70 ( 0.0%) I IIIIIIIII I IIIII I I 154948 (100.0%) 0 (0.0%)
4 Order [character] 1. (Empty string) 2. 0 3. - 4. 3189452 5. 7101462612 6. 7101450475 7. 4101079583 8. 3221733 9. 4203001839 10. 1428853 [ 18294 others ] 1709 ( 1.1%) 547 ( 0.4%) 326 ( 0.2%) 145 ( 0.1%) 142 ( 0.1%) 141 ( 0.1%) 128 ( 0.1%) 127 ( 0.1%) 122 ( 0.1%) 117 ( 0.1%) 151444 (97.7%) 154948 (100.0%) 0 (0.0%)
5 Department [character] All NA’s 0 (0.0%) 154948 (100.0%)
6 SubmittedBy [character] 1. USER_54494e4720485549 2. USER_46656c69636961 3. USER_4a6f616e6e6520546861 4. USER_77616e207169 5. USER_46454c4958 6. USER_4a6f6e617468616e 7. USER_4c69687569 8. USER_4a61736d696e65204875 9. USER_4873696e67 10. USER_53554d495441 [ 94 others ] 19587 (12.6%) 10744 ( 6.9%) 9912 ( 6.4%) 9554 ( 6.2%) 8171 ( 5.3%) 7490 ( 4.8%) 6832 ( 4.4%) 6148 ( 4.0%) 6144 ( 4.0%) 4892 ( 3.2%) 65474 (42.3%) II I I I I 154948 (100.0%) 0 (0.0%)
7 ID [integer] Mean (sd) : 11777 (6906) min < med < max: 1 < 11679 < 23801 IQR (CV) : 12069 (0.6) 23146 distinct values : : . : : : . : : . : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 154948 (100.0%) 0 (0.0%)
8 Month [Date] min : 2094-07-01 med : 2095-01-01 max : 2095-07-01 range : 1y 0m 0d 13 distinct values : : : . . : . . : : : : : : : : : : 154948 (100.0%) 0 (0.0%)
9 word [character] 1. change 2. info 3. protect 4. delivery 5. customer 6. line 7. card 8. message 9. recipient 10. assist [ 7709 others ] 6077 ( 3.9%) 5756 ( 3.7%) 5732 ( 3.7%) 5224 ( 3.4%) 3942 ( 2.5%) 2805 ( 1.8%) 2649 ( 1.7%) 2291 ( 1.5%) 2079 ( 1.3%) 2067 ( 1.3%) 116326 (75.1%) 154948 (100.0%) 0 (0.0%)

# Persist the token table to the working directory, date-stamped
write.csv(tokens, file = paste(today, "Tokens.csv"))

3.4 Explore Cleaned Data

Purpose: To gain initial insights into the dataset and identify patterns or anomalies. Explanation: Summary Statistics: The dataset’s structure, number of entries, and date range were explored. Visualisation: Bar charts and Pareto charts were used to highlight common trends, such as the frequency of different comment types. Data frame is explored by showing available columns, structure and first five rows of data.


# Loading other packages if not available
# NOTE(review): vtable is already installed/attached at the top of the
# script; this guard is redundant but harmless.
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Re-coerce the date columns (no-ops if they are already Dates, which
# sections 3.1 ensured — kept as a defensive safeguard)
TextData$EnqDate <- as.Date(TextData$EnqDate)
TextData$Month <- as.Date(TextData$Month)

# (kept for reference) descriptive summary of the cleaned data
# df_summary <- dfSummary(TextData)

# (kept for reference) view the summary in the RStudio Viewer or browser
# print(df_summary)

# HTML preview of the first token rows
knitr::kable(head(tokens), format = "html", caption = "Sample Data from Tokens") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from Tokens
Business EnqDate Type Order Department SubmittedBy ID Month word
Division A 2094-07-03 Changes 1642960 NA USER_544f4b2059552054494e47 1 2094-07-01 bec
Division A 2094-07-03 Changes 1642960 NA USER_544f4b2059552054494e47 1 2094-07-01 hold
Division A 2094-07-03 Feedback 1642821 NA USER_77616e207169 2 2094-07-01 rebill
Division A 2094-07-03 Feedback 1642821 NA USER_77616e207169 2 2094-07-01 account
Division A 2094-07-03 Feedback 1642821 NA USER_77616e207169 2 2094-07-01 account
Division A 2094-07-03 Feedback 1642821 NA USER_77616e207169 2 2094-07-01 account

knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint Department SubmittedBy ID Month
Division A 2094-07-03 Changes 1642960 bec hold order NA USER_544f4b2059552054494e47 1 2094-07-01
Division A 2094-07-03 Feedback 1642821 rebill account account account number amend company fde chinese annual conference methodist church singapore email revised invoice mercy koh methodist org sg NA USER_77616e207169 2 2094-07-01
Division A 2094-07-03 Feedback 1642702 pls check fongyi photo shooting returned immediatiely NA USER_73616e647261 3 2094-07-01
Division A 2094-07-03 Others 1642924 info protected NA USER_77616e207169 4 2094-07-01
Division A 2094-07-03 Others 1639775 info protected NA USER_544f4b2059552054494e47 5 2094-07-01
Division A 2094-07-03 Delivery Status 7101436479 kindly check delivery status today pm delivery called back fee ff feonnesoo gmail NA USER_54494e4720485549 6 2094-07-01

4 Sentiment Analysis by Business, Month and Type Using Bing Lexicon

Purpose: To assess the emotional tone of the text data, identifying areas with strong positive or negative sentiments. Explanation: Sentiment lexicons like Bing or NRC were used to classify words as positive, negative, or associated with specific emotions. Sentiment scores were calculated and visualized over time and by business type. The result of positive and negative sentiments by business and month is printed in bar charts.


# Load Bing sentiment lexicon (maps word -> "positive"/"negative")
bing_lexicon <- get_sentiments("bing")

# Score each token +1/-1 and total the scores per Business x Month x Type.
# `.groups = "drop"` returns an ungrouped result directly, replacing the
# original summarise()/ungroup() pair and its regrouping message.
sentiment_scores <- tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Business, Month, Type) %>%
  summarise(sentiment_score = sum(score), .groups = "drop")

# Dodged bar chart of monthly sentiment scores, one panel per Business.
Plot <- ggplot(
  sentiment_scores,
  aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)
) +
  geom_col(position = "dodge") +
  facet_wrap(~ Business, ncol = 1) +
  scale_fill_manual(
    values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
    labels = c("Positive", "Negative"),
    limits = c("TRUE", "FALSE"),   # legend order: Positive first
    name   = "Sentiment"
  ) +
  labs(
    title = "Sentiment Analysis by Business and Month",
    x = NULL,
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  # The original layered four theme() calls; they are merged here — later
  # settings simply overrode earlier identical ones, so the result is the same.
  theme(
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text  = element_text(size = 14, face = "bold", color = "blue"),
    plot.title  = element_text(size = 20, hjust = 0.5, face = "bold")
  )

# Display the plot
Plot


# Save the plot to the working directory
ggsave(
  filename = "Sentiment Analysis by Business and Month 1.png",
  plot = Plot, width = 8, height = 5
)

# Same monthly sentiment view, but with positive and negative scores
# stacked into a single bar per month, faceted by Business.
Plot <- ggplot(
  sentiment_scores,
  aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)
) +
  geom_col(position = "stack") +
  facet_wrap(~ Business, ncol = 1) +
  scale_fill_manual(
    values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
    labels = c("Positive", "Negative"),
    limits = c("TRUE", "FALSE"),   # legend order: Positive first
    name   = "Sentiment"
  ) +
  labs(
    title = "Sentiment Analysis by Business and Month",
    x = NULL,
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  # Four theme() calls from the original merged into one equivalent call.
  theme(
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text  = element_text(size = 14, face = "bold", color = "blue"),
    plot.title  = element_text(size = 20, hjust = 0.5, face = "bold")
  )

# Display the plot
Plot


# Save the plot to the working directory
ggsave(
  filename = "Sentiment Analysis by Business and Month 2.png",
  plot = Plot, width = 8, height = 5
)

6 Pareto Chart of Comment Types

The number of comments is stratified by comment types and shown in a Pareto chart.


# Calculate the frequency and cumulative percentage for each comment Type
type_frequency <- TextData %>%
  count(Type, sort = TRUE) %>%
  mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)

# Scale factor mapping the cumulative percentage (0-100) onto the frequency
# axis. Hoisted out of the plot: the original reached into
# type_frequency$n inside aes(), which bypasses ggplot's data masking and
# breaks silently if the plot data is ever filtered or faceted.
max_n <- max(type_frequency$n)

# Pareto chart: blue bars = frequency (left axis), red line/points =
# cumulative percentage (right axis via sec_axis).
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
  geom_col(fill = "steelblue") +   # geom_col == geom_bar(stat = "identity")
  geom_text(aes(label = n), vjust = -0.5, size = 4) +
  geom_line(aes(y = cumulative_percentage * max_n / 100, group = 1),
    color = "red", size = 1) +
  geom_point(aes(y = cumulative_percentage * max_n / 100), color = "red") +
  geom_text(aes(
    y = cumulative_percentage * max_n / 100,
    label = sprintf("%.1f%%", cumulative_percentage)
    ), color = "red", vjust = -0.5, size = 4) +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / max_n * 100, name = "Cumulative Percentage")
  ) +
  labs(title = "Pareto Chart of Comment Types",
       x = NULL,
       y = "Frequency") +
  theme(
    plot.title  = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Display the plot
Plot


# Save the plot to the working directory
ggsave(filename = "Pareto Chart of Comment Types.png", plot = Plot, width = 8, height = 5)

7 Top Words by Business and Type

Purpose: To understand the key terms or themes in the dataset and their relevance across different categories. Explanation: The most frequently occurring words were identified and visualised. The most common words in comments are extracted by business and comment type and displayed in bar charts.


# Count word frequencies by Business and Type, most frequent first.
# (Month is not part of the grouping, despite the original comment.)
# count() replaces the group_by()/summarise(n())/arrange() chain.
word_freq <- tokens %>%
  count(Business, Type, word, name = "freq", sort = TRUE)

# Keep the full table under the name used downstream. The original's
# group_by() immediately followed by ungroup() was a no-op.
top_words <- word_freq

# Top 10 words within each Type. slice_max keeps ties by default
# (with_ties = TRUE), matching the previous behaviour.
top_10_words <- top_words %>%
  group_by(Type) %>%
  slice_max(order_by = freq, n = 10) %>%
  ungroup()

# Horizontal bar chart of the top 10 words per comment Type, coloured by
# Business; reorder_within/scale_x_reordered sort bars inside each facet.
Plot <- ggplot(
  top_10_words,
  aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)
) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Type, scales = "free", ncol = 3) +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "Top Words by Business and Type",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal() +
  # Three theme() calls from the original merged into one equivalent call.
  theme(
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text  = element_text(size = 14, face = "bold", color = "blue"),
    plot.title  = element_text(size = 20, hjust = 0.5, face = "bold")
  )

# Display the plot
Plot


# Save the plot to the working directory
ggsave(filename = "Word Frequency Analysis.jpg", plot = Plot, width = 12, height = 7)

cat("\nShow Top 10 Words Table\n")

Show Top 10 Words Table

# HTML table of the first rows of the per-Type top-10 word list.
# Caption corrected: the table shows top_10_words, not TextData.
knitr::kable(head(top_10_words), format = "html", caption = "Top 10 Words by Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business Type word freq
Division A Cancel cancel 792
Division B Cancel cancel 539
Division A Cancel customer 384
Division A Cancel reprocess 359
Division A Cancel line 305
Division A Cancel item 238

cat("\nShow Top 10 Terms Table\n")

Show Top 10 Terms Table

# BUG FIX: the original called head(terms) here, but the `terms` object is
# only created in section 9.1 (`terms <- terms(lda_model, 40)`), so at this
# point the name resolved to the stats::terms S4 generic — which is exactly
# the garbage the rendered output showed. Use the corpus-wide term
# frequency table, which does exist at this point in the script.
knitr::kable(head(TextDatafreq, 10), format = "html", caption = "Top 10 Terms by Frequency") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
new(“standardGeneric”, .Data = function (x, …)
standardGeneric(“terms”), generic = structure(“terms”, package = “stats”),
package = “stats”, group = list(), valueClass = character(0),
signature = “x”, default = new(“derivedDefaultMethod”, .Data = function (x,
…)
UseMethod(“terms”), target = new(“signature”, .Data = “ANY”,

8 Topic Modelling by Business, Month, and Type Using LDA

Purpose: To uncover the underlying themes or categories within the dataset, providing a high-level view of the content. Explanation: Latent Dirichlet Allocation (LDA) was applied to discover hidden topics within the text. The optimal number of topics was determined using metrics like Griffiths2004 and CaoJuan2009. The most frequent terms within each topic were extracted to assign meaningful labels to topics. ## Identifying the Best Number of Topics for LDA following Arun (2010), CaoJuan (2009), Deveaud (2014), Griffiths (2004) Latent Dirichlet Allocation (LDA) is used to discover hidden topics. Firstly, the number of topics needs to be decided based on indicators by Arun (2010), CaoJuan (2009), Deveaud (2014) and Griffiths (2004). The indicators by Arun and CaoJuan should be minimised, whereas the indicators by Deveaud and Griffiths should be maximised.


# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)

# Document-Term Matrix: one document per comment ID, one column per word,
# cell = token count (the input to FindTopicsNumber and LDA below)
dtm <- tokens %>%
  count(ID, word) %>%
  cast_dtm(ID, word, n)

# Scan k = 2..15 topics with all four ldatuning indicators
# (minimise Arun2010/CaoJuan2009, maximise Griffiths2004/Deveaud2014);
# fixed seed so the Gibbs sampling is reproducible.
LDA_result4 <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result4)


# Second scan restricted to the two indicators that were most informative
# in the four-metric run (CaoJuan2009 to minimise, Deveaud2014 to maximise)
LDA_result2 <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("CaoJuan2009", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: CaoJuan2009… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result2)

8.1 Topic Modeling by Business, Month, and Type Using LDA

Secondly, Latent Dirichlet Allocation (LDA) is conducted with the number of topics derived in the previous step. For practicality reasons, a topic number of four is chosen for further analysis.


# Number of topics, chosen from the ldatuning diagnostics above for
# practicality (`<-` for assignment, per R convention)
NumTopics <- 4

# Fit the LDA model; fixed seed makes the fit reproducible
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))

# Per-topic word probabilities (the beta matrix) in tidy long format
topics <- tidy(lda_model, matrix = "beta")

# Top 10 terms per topic, highest beta first. slice_max() supersedes
# top_n() and makes the ranking column and direction explicit; both keep
# ties, so the selection is unchanged.
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  arrange(topic, desc(beta))

# Print one comma-separated line of top terms per topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print(n = Inf)  # Print all rows (no truncation)

9 A tibble: 4 × 2

topic terms

1 1 protect, change, customer, info, date, cancel, remove, fd, ce, card
2 2 info, change, delivery, customer, recipient, line, card, address, prote… 3 3 delivery, change, protect, customer, line, message, sender, card, pm, t… 4 4 info, change, delivery, protect, assist, message, customer, check, pm, …


# Persist the top-terms table, date-stamped
write.csv(top_terms, file = paste(today, "TopTerms.csv"))

# Bar chart of the highest-beta terms within each topic; bars are ordered
# per facet via reorder_within/scale_x_reordered.
Plot <- ggplot(
  top_terms,
  aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 2) +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "Top Terms per Topic",
    x = "Term",
    y = "Frequency"
  ) +
  theme(
    plot.title  = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Display the plot
Plot


# Save the plot to the working directory
ggsave(filename = "Top Terms in each Topic.png", plot = Plot, width = 16, height = 12)

9.1 Topic Modelling Results

Extract the most frequent terms per topic and business. This should help to assign themes to topics.


# Posterior distributions of the fitted model: a list with $terms
# (topic x word probabilities) and $topics (document x topic probabilities)
tmResult <- posterior(lda_model)

# Inspect the structure of the resulting object
attributes(tmResult)

$names [1] “terms” “topics”

nTerms(dtm)              # lengthOfVocab

[1] 7719


beta <- tmResult$terms   # get beta from results
dim(beta)   

[1] 4 7719


rowSums(beta)            # rows in beta sum to 1

1 2 3 4 1 1 1 1


# 40 most probable terms per topic (matrix: one column per topic).
# NOTE(review): this assignment masks the stats::terms() generic for the
# rest of the session, and any bare use of `terms` BEFORE this line runs
# (as in section 7) picks up the generic instead — consider a more
# specific name such as lda_terms.
terms <- terms(lda_model, 40)

# Topic labels: concatenate each topic's top-5 terms into one string
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
topicNames
                              Topic 1 
  "protect change customer info date" 
                              Topic 2 

“info change delivery customer recipient” Topic 3 “delivery change protect customer line” Topic 4 “info change delivery protect assist”


# theta: per-document topic probability distribution (documents x topics);
# rows correspond to the documents kept in the DTM, named by document ID.
theta <- tmResult$topics 
dim(theta)   

[1] 23146 4


# Persist the 40-terms-per-topic matrix, date-stamped
write.csv(terms, file = paste(today, "Terms.csv"))

9.2 Visualise Modelling Results

Purpose: To make the results accessible and actionable for stakeholders. Explanation: Various visualisations were created, such as bar charts, word clouds, and topic distributions, to present findings in an easily interpretable format. Each visualisation was tailored to highlight specific insights, such as topic importance or sentiment trends.


# Loading other packages if not available.
# htmlwidgets added: saveWidget() lives there and wordcloud2 only Imports
# it (does not attach it), so the original saveWidget() call relied on the
# function happening to be on the search path.
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud2" %in% installed.packages()) { install.packages("wordcloud2", dependencies = TRUE) }
library(wordcloud2)
if(! "htmlwidgets" %in% installed.packages()) { install.packages("htmlwidgets", dependencies = TRUE) }
library(htmlwidgets)

# One word cloud per topic: take the 40 most probable terms with their
# probabilities, render the cloud, and save each widget as a date-stamped
# self-contained HTML file.
for (topicToViz in seq_len(NumTopics)) {

  cat("\nWordcloud for topic ", topicToViz, "\n\n")

  # 40 most probable terms for this topic, sorted once. (The original
  # sorted the same probability vector twice — once for the names and
  # once for the values.)
  top40terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:40]

  wordDF <- data.frame(
    word = names(top40terms),
    freq = top40terms
  )

  # Widgets are not auto-printed inside a for loop, so print() explicitly
  # to make the cloud appear in the rendered document.
  mycolors <- brewer.pal(8, "Dark2")
  print(wordcloud2(wordDF, color = mycolors, shape = 'star'))

  # Save the widget (same space-separated file name as before)
  saveWidget(wordcloud2(wordDF), paste(today, "Wordcloud", topicToViz, ".html", sep = " "), selfcontained = TRUE)

}

Wordcloud for topic 1

Wordcloud for topic 2

Wordcloud for topic 3

Wordcloud for topic 4

9.3 Visualise Topic Distribution within Documents

Identify the distribution of topics in documents, i.e., comments.


# Install (if missing) and attach the plotting/reshaping helpers
for (pkg in c("RColorBrewer", "wordcloud", "reshape2")) {
  if (!pkg %in% installed.packages()) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

# Example document IDs (comment IDs) whose topic mixture we will inspect
exampleIds <- c(55, 62, 1111, 2224)

# Announce the sample listing
cat("\nShow sample documents, i.e., comments:\n\n")

Show sample documents, i.e., comments:


# Pull the example comments (ID + Complaint text) out of the main data set
SelectedComplaints <- select(
  filter(TextData, ID %in% exampleIds),
  ID, Complaint
)

print(SelectedComplaints)
    ID Complaint
1   55 typo error message options revamp website rectify wife dear noelgifts wrong sending rest life meaning shouldnt spending rest life
2   62 customer called requesting deliver earlier hes giving wife surprise bringing dinner receiving bouquet inform customer promises highlight
3 1111 called amend details amend follow ddd db
4 2224 info protected


N <- length(exampleIds)


# Get topic proportions for the example documents (rows = docs, cols = topics)
topicProportionExamples <- theta[exampleIds, ]

colnames(topicProportionExamples) <- topicNames

# Long format for ggplot: one row per (document, topic) pair
vizDataFrame <- melt(cbind(data.frame(topicProportionExamples),
                           document = factor(exampleIds)),
                     variable.name = "topic", id.vars = "document")

# One horizontal bar per topic, faceted by document.
# Fixes vs. the previous version: the bars were drawn twice (geom_bar(stat =
# "identity") plus geom_col() are the same layer); ggplot() was given an
# invalid `ylab` argument (silently ignored); an early theme() call was
# overridden by the later theme_minimal(); and the value axis was mislabeled
# "Topic" although it shows the topic *proportion*.
Plot <- ggplot(vizDataFrame, aes(topic, value, fill = document)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(
    # "3334" entries kept so the palette still works if the ID is re-added
    values = c("55" = "lightgreen", "62" = "lightblue", "1111" = "pink",
               "2224" = "brown", "3334" = "orange"),
    labels = c("Com 55", "Com 62", "Com 1111", "Com 2224", "Com 3334"),
    name = "Comment"
  ) +
  facet_wrap(~ document, ncol = 4) +
  labs(title = "Topic Proportions in Sample Comments",
       x = NULL, y = "Proportion") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")
  )

Plot


# Save the plot
ggsave(filename = paste0("Topic Proportion Analysis", ".jpg"),
       plot = Plot, width = 15, height = 9)

10 Emotion Analysis by Business, Month, and Type

The NRC lexicon assigns words to specific emotions such as anger, joy, sadness, and more, along with general positive or negative sentiment. This step ensures that the emotions are defined for words in the tokenized text data. Each word in the dataset is matched with its corresponding emotion. Words are aggregated by Business, Month, and Type. This allows the data to be reshaped, showing how often each emotion appears within different categories. Reshaping the data into a long format is essential for visualization with ggplot2. The Month is also formatted into a standard date type for proper chronological ordering in plots. Using str_to_title() ensures consistent capitalization of emotion labels. Assigning distinct colors for each emotion enhances the readability of the visualisations. Grouping similar emotions (e.g., red shades for negative emotions, green shades for positive ones) could improve clarity.


# Loading other packages if not available
if(! "textdata" %in% installed.packages()) { install.packages("textdata", dependencies = TRUE) }
library(textdata)

# Load NRC sentiment lexicon (maps word -> emotion/sentiment category)
nrc_lexicon <- get_sentiments("nrc")

# Join tokens with the NRC lexicon, then count emotion hits per
# Business / Month / Type cell. pivot_wider() replaces the superseded
# spread(); values_fill = 0 keeps the fill = 0 behaviour.
emotions <- tokens %>%
  inner_join(nrc_lexicon, by = "word") %>%
  count(Business, Month, Type, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)

# Reshape back to long form for plotting and normalise the labels.
# pivot_longer() replaces the superseded gather().
emotion_long <- emotions %>%
  pivot_longer(cols = -c(Business, Month, Type),
               names_to = "emotion", values_to = "count") %>%
  mutate(Month = ymd(Month)) %>%   # assumes Month parses as y-m-d -- TODO confirm
  mutate(emotion = str_to_title(emotion))

# Named colour palette keyed by emotion label (titles, matching
# str_to_title() output). Red/brown shades mark negative emotions,
# green shades positive ones.
emotion_colors <- c(
  Anger        = "firebrick",
  Anticipation = "lightgrey",
  Disgust      = "brown",
  Fear         = "tomato",
  Joy          = "lightgreen",
  Negative     = "indianred",
  Positive     = "forestgreen",
  Sadness      = "salmon",
  Surprise     = "palegreen",
  Trust        = "darkolivegreen"
)

# Faceted overview: monthly stacked emotion counts, one panel per Type.
# Fixes vs. the previous version: the plot object was built but never
# displayed (the per-Type loop below immediately overwrote it), and it did
# not apply the custom emotion palette used by the per-Type plots.
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = emotion_colors, name = "Emotion") +
  facet_wrap(~ Type, scales = "free") +
  labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
       x = NULL, y = "Emotion Count") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Display the overview before the per-Type plots are generated
print(Plot)


# Plot and export one stacked emotion chart per distinct Type
unique_types <- unique(emotion_long$Type)

for (type in unique_types) {

  # Restrict the long-format data to the current Type
  data_subset <- filter(emotion_long, Type == type)

  # Monthly stacked emotion counts for this Type, coloured with the
  # shared emotion palette
  Plot <- data_subset %>%
    ggplot(aes(x = Month, y = count, fill = emotion)) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = emotion_colors, name = "Emotion") +
    labs(title = paste("Emotion Analysis for Type:", type),
         x = NULL, y = "Emotion Count") +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
      axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
    )

  # Display in the report
  print(Plot)

  # Export as PNG, one file per Type
  ggsave(filename = paste0("Emotion_Analysis_Type_", type, ".png"),
         plot = Plot, width = 10, height = 6)
}