1 Load Necessary Libraries and Make Some Settings


# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidytext" %in% installed.packages()) { install.packages("tidytext", dependencies = TRUE) }
library(tidytext)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "textclean" %in% installed.packages()) { install.packages("textclean", dependencies = TRUE) }
library(textclean)
if(! "tm" %in% installed.packages()) { install.packages("tm", dependencies = TRUE) }
library(tm)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "lubridate" %in% installed.packages()) { install.packages("lubridate", dependencies = TRUE) }
library(lubridate)
if(! "topicmodels" %in% installed.packages()) { install.packages("topicmodels", dependencies = TRUE) }
library(topicmodels)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
if(! "kableExtra" %in% installed.packages()) { install.packages("kableExtra", dependencies = TRUE) }
library(kableExtra)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
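
# Equivalent, more compact alternative (a sketch): the same install-and-load
# pattern written once as a loop
# pkgs <- c("rmarkdown", "readxl", "tidytext", "tidyr", "textclean", "tm",
#           "dplyr", "ggplot2", "lubridate", "topicmodels", "stringr",
#           "kableExtra", "vtable")
# for (p in pkgs) {
#   if (!p %in% installed.packages()) install.packages(p, dependencies = TRUE)
#   library(p, character.only = TRUE)
# }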

# Global Settings
options(digits = 4)
options(scipen = 999)
setwd("C:/Users/uwe/OneDrive/Documents/AD Data/TextMining/R")

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
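# e.g., "240131" for 31 January 2024 (YYMMDD)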

2 Get Data and Add ID

Purpose: To ensure all data are ready for analysis, with no missing or incompatible entries. Explanation: This phase gathers the data and loads them into R for processing. The dataset, containing text data such as customer comments and feedback, is retrieved from a URL and imported with read.csv(), and any encoding issues are handled to ensure compatibility. The original ID number is replaced by a new ID assigned in date order, and the amended data file is saved to the working directory.


# Get Your Text Data and Save Them in TextData
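
# Hypothetical example (the actual source is not shown in this report):
# assuming the raw comments are hosted as a CSV file at a URL, they could be
# read with read.csv(); the URL below is a placeholder
# TextDataRaw <- read.csv("https://example.com/TextDataRaw.csv",
#                         stringsAsFactors = FALSE, fileEncoding = "UTF-8")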

# Drop the original ID column and add a new ID in date order
TextData    <- TextDataRaw[, -2]
TextData    <- TextData[order(TextData$EnqDate), ]
TextData$ID <- seq.int(nrow(TextData))

# Write amended data to the working directory
write.csv(TextData, file = paste(today, "TextData.csv"), row.names = FALSE)

3 Prepare Data

3.1 Clean Data and Add Month Column

Purpose: To prepare the text data for analysis by removing noise and retaining only meaningful content. Explanation: Non-ASCII characters, punctuation, and numbers are removed, and all text is converted to lower case for consistency. Encoding the text in UTF-8 ensures compatibility with all tools. The date column is converted to R's Date format, and a Month column is created.


# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }

library(summarytools)
# Convert the string to UTF-8 encoding
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")

# Remove non-ASCII (multibyte) characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)

# Omitting any NA values
TextData <- TextData %>%
  filter(!is.na(Complaint)) %>%
  mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
  mutate(Type = str_to_title(Type)) # Convert to titlecase

# Ensure your date format is consistent
TextData$EnqDate <- as.Date(TextData$EnqDate)

# Extract month (in "Year-Month" format)
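# floor_date() maps every date to the first day of its month, e.g., 2023-02-17 becomes 2023-02-01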
TextData <- TextData %>%
  mutate(Month = floor_date(EnqDate, "month"))

# Generate the summary
df_summary <- dfSummary(TextData)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary: TextData
Dimensions: 23801 x 7 | Duplicates: 0

1. Business [character]: Division A 16908 (71.0%); Division B 6893 (29.0%) | Valid: 23801 (100.0%), Missing: 0 (0.0%)
2. EnqDate [Date]: min 2022-07-07; med 2023-02-03; max 2023-07-06; range 11m 29d; 314 distinct values | Valid: 23801 (100.0%), Missing: 0 (0.0%)
3. Type [character]: Cancel 1618 (6.8%); Changes 9332 (39.2%); Credit Card 26 (0.1%); Delivery Status 993 (4.2%); Feedback 3419 (14.4%); Not Delivered 2681 (11.3%); Others 5732 (24.1%) | Valid: 23801 (100.0%), Missing: 0 (0.0%)
4. Order [character]: (empty string) 795 (3.3%); "-" 138 (0.6%); "0" 77 (0.3%); 3189452 38 (0.2%); 3189100 25 (0.1%); 3217360 25 (0.1%); 5159916 24 (0.1%); 5160062 18 (0.1%); 5172125 17 (0.1%); 3217115 15 (0.1%); [18620 others] 22629 (95.1%) | Valid: 23801 (100.0%), Missing: 0 (0.0%)
5. Complaint [character]: "no information" 5758 (24.2%); "pdo - not in" 357 (1.5%); "pdo - shifted" 177 (0.7%); "pdo- not in" 124 (0.5%); "pdo - shifted." 78 (0.3%); "pdo - closed" 60 (0.3%); "pdo - no such person" 47 (0.2%); "pls update delivery statu" 41 (0.2%); "pdo - discharged" 37 (0.2%); "pdo - closed." 35 (0.1%); [16116 others] 17087 (71.8%) | Valid: 23801 (100.0%), Missing: 0 (0.0%)
6. ID [integer]: mean (sd) 11901 (6871); min < med < max 1 < 11901 < 23801; IQR (CV) 11900 (0.6); 23801 distinct values (integer sequence) | Valid: 23801 (100.0%), Missing: 0 (0.0%)
7. Month [Date]: min 2022-07-01; med 2023-02-01; max 2023-07-01; range 1y 0m 0d; 13 distinct values | Valid: 23801 (100.0%), Missing: 0 (0.0%)

# Show Characteristics of Data Frame
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

3.2 Preprocess Data

Punctuation, numbers, stopwords (common words such as "the" and "and") and names are removed. All comments are split into individual words, i.e., tokens. Any comments marked as "no information" or containing NA values are excluded.


# Example of text cleaning
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint) 
TextData$Complaint <- removePunctuation(TextData$Complaint) 
TextData$Complaint <- removeNumbers(TextData$Complaint) 
TextData <- TextData %>%
    filter(Complaint != "no information") %>%
    mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))

# Define custom stopwords (e.g., names)
custom_stopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
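# date_pattern matches numeric dates such as "12/05/2023" or "1-5-23"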
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern for capitalised names; the text has already
                                    # been lower-cased at this point, so known names are mainly
                                    # caught by the custom_stopwords list above

# Perform text analysis excluding stopwords and names
TextDatafreq <- TextData %>%
    unnest_tokens(word, Complaint) %>%
    filter(!word %in% custom_stopwords) %>%
    filter(!str_detect(word, date_pattern)) %>%
    filter(!str_detect(word, name_pattern)) %>%
    count(word, sort = TRUE)

3.3 Tokenise Data

Purpose: To enable word-level analysis, which is crucial for frequency analysis, sentiment analysis, and topic modeling. Explanation: Text data was split into individual words (tokens) using unnest_tokens(). This step transforms unstructured text into structured data, making it easier to analyse.
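
As a minimal illustration (a hypothetical one-row example, not part of the dataset), unnest_tokens() turns one comment into one row per word while carrying the other columns along:


# Hypothetical one-row example of tokenisation
example <- dplyr::tibble(ID = 1, Complaint = "change delivery date")
tidytext::unnest_tokens(example, word, Complaint)
# Returns three rows with ID = 1 and word = "change", "delivery", "date"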


# Loading other packages if not available
if(! "textstem" %in% installed.packages()) { install.packages("textstem", dependencies = TRUE) }
library(textstem)

# Tokenize text data and remove stop words
tokens <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  anti_join(stop_words) %>%
  filter(!is.na(word))

# Lemmatise Tokens
tokensOld <- tokens
tokens$word <- lemmatize_words(tokens$word)
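# For example, lemmatize_words(c("changed", "deliveries")) should return "change" and "delivery"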

# Join tokens with stratified data
tokens <- tokens %>%
  inner_join(TextData %>% select(ID, Business, Month, Type), by = "ID") %>%
  select(-Business.y, -Month.y, -Type.y) %>%
  rename(
        Business = Business.x,
        Month    = Month.x,
        Type     = Type.x
    )

NumRawComments <- format(nrow(TextDataRaw), big.mark = ",", scientific = FALSE)
NumComments    <- format(nrow(TextData),    big.mark = ",", scientific = FALSE)
NumTokens      <- format(nrow(tokens),      big.mark = ",", scientific = FALSE)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")

cat("\nThis dataset includes:\n")

This dataset includes:

cat("- Number of raw comments:", NumRawComments,"\n")
  • Number of raw comments: 23,801
cat("- Number of clean comments:", NumComments,"\n")
  • Number of clean comments: 18,043
cat("- Number of tokens:", NumTokens,"\n")
  • Number of tokens: 225,957
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
  • Comments between: 07 Jul 2022 to 06 Jul 2023

# Generate the summary
df_summary <- dfSummary(tokens)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary: tokens
Dimensions: 225957 x 7 | Duplicates: 47293

1. Business [character]: Division A 160584 (71.1%); Division B 65373 (28.9%) | Valid: 225957 (100.0%), Missing: 0 (0.0%)
2. EnqDate [Date]: min 2022-07-07; med 2023-02-03; max 2023-07-06; range 11m 29d; 312 distinct values | Valid: 225957 (100.0%), Missing: 0 (0.0%)
3. Type [character]: Cancel 11675 (5.2%); Changes 142776 (63.2%); Delivery Status 9029 (4.0%); Feedback 50963 (22.6%); Not Delivered 11514 (5.1%) | Valid: 225957 (100.0%), Missing: 0 (0.0%)
4. Order [character]: "0" 515 (0.2%); 3189452 220 (0.1%); (empty string) 173 (0.1%); 4203001839 159 (0.1%); 7101450475 153 (0.1%); 7101462612 153 (0.1%); 7101452379 135 (0.1%); 3221733 134 (0.1%); 4101079583 133 (0.1%); 3217266 129 (0.1%); [15067 others] 224053 (99.2%) | Valid: 225957 (100.0%), Missing: 0 (0.0%)
5. ID [integer]: mean (sd) 11921 (6835); min < med < max 3 < 11909 < 23787; IQR (CV) 11789 (0.6); 17990 distinct values | Valid: 225957 (100.0%), Missing: 0 (0.0%)
6. Month [Date]: min 2022-07-01; med 2023-02-01; max 2023-07-01; range 1y 0m 0d; 13 distinct values | Valid: 225957 (100.0%), Missing: 0 (0.0%)
7. word [character]: delivery 24079 (10.7%); date 20477 (9.1%); msg 9958 (4.4%); pm 7649 (3.4%); change 6914 (3.1%); customer 3956 (1.8%); not 3771 (1.7%); message 3307 (1.5%); line 2810 (1.2%); card 2769 (1.2%); [11179 others] 140267 (62.1%) | Valid: 225957 (100.0%), Missing: 0 (0.0%)

3.4 Explore Cleaned Data

Purpose: To gain initial insights into the dataset and identify patterns or anomalies. Explanation: Summary statistics describe the dataset's structure, number of entries, and date range. Bar charts and Pareto charts highlight common trends, such as the frequency of different comment types. The data frame is explored by showing the available columns, their types, and a few sample rows.


# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Remove any problematic characters, including control and special characters
# Note: mutate_all() coerces every column to character, so the date columns are
# restored below (ID remains character from here on)
TextData <- TextData %>%
  mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = ""))  # Removes invalid characters

TextData$EnqDate <- as.Date(TextData$EnqDate)
TextData$Month <- as.Date(TextData$Month)

# Generate the summary
df_summary <- dfSummary(TextData)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary: TextData
Dimensions: 18043 x 7 | Duplicates: 0

1. Business [character]: Division A 12671 (70.2%); Division B 5372 (29.8%) | Valid: 18043 (100.0%), Missing: 0 (0.0%)
2. EnqDate [Date]: min 2022-07-07; med 2023-02-04; max 2023-07-06; range 11m 29d; 312 distinct values | Valid: 18043 (100.0%), Missing: 0 (0.0%)
3. Type [character]: Cancel 1618 (9.0%); Changes 9332 (51.7%); Delivery Status 993 (5.5%); Feedback 3419 (18.9%); Not Delivered 2681 (14.9%) | Valid: 18043 (100.0%), Missing: 0 (0.0%)
4. Order [character]: 3189452 38 (0.2%); "0" 32 (0.2%); 3189100 25 (0.1%); 3217360 25 (0.1%); 5159916 23 (0.1%); 5160062 18 (0.1%); (empty string) 17 (0.1%); 5172125 17 (0.1%); 3212425 14 (0.1%); 3221869 14 (0.1%); [15086 others] 17820 (98.8%) | Valid: 18043 (100.0%), Missing: 0 (0.0%)
5. Complaint [character]: "pdo not in" 539 (3.0%); "pdo shifted" 329 (1.8%); "pdo closed" 123 (0.7%); "pdo no such person" 73 (0.4%); "pdo cannot locate" 72 (0.4%); "amount should be sap capt" 64 (0.4%); "pdo no such address" 57 (0.3%); "pdo discharged" 49 (0.3%); "not in" 48 (0.3%); "pdo not in x" 44 (0.2%); [15293 others] 16645 (92.3%) | Valid: 18043 (100.0%), Missing: 0 (0.0%)
6. ID [character]: 18043 distinct values, each occurring once (coerced from integer to character by the iconv step above) | Valid: 18043 (100.0%), Missing: 0 (0.0%)
7. Month [Date]: min 2022-07-01; med 2023-02-01; max 2023-07-01; range 1y 0m 0d; 13 distinct values | Valid: 18043 (100.0%), Missing: 0 (0.0%)

# Show Characteristics of Tokens
knitr::kable(head(tokens), format = "html", caption = "Sample Data from Tokens") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from Tokens
Business EnqDate Type Order ID Month word
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 delivery
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 date
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 delivery
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 date
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 msg
Division A 2022-07-07 Changes 7101436529 3 2022-07-01 kindly

# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

4 Sentiment Analysis by Business, Month and Type Using Bing Lexicon

Purpose: To assess the emotional tone of the text data, identifying areas with strongly positive or negative sentiment. Explanation: The Bing lexicon is used to classify words as positive or negative (the NRC lexicon, which also covers specific emotions, is used in Section 8). Sentiment scores are calculated and visualised over time and by business type. The resulting positive and negative sentiment by business and month is shown in bar charts.


# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")

# Join tokens with Bing sentiments and calculate scores
sentiment_scores <- tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
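  # Net score per stratum = (number of positive words) - (number of negative words)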
  group_by(Business, Month, Type) %>%
  summarise(sentiment_score = sum(score)) %>%
  ungroup()

# Example of improved sentiment visualization
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "dodge") +  # Stacked bars for each Business within each Month
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 1", ".png", sep = ""), plot = Plot, width = 8, height = 5)

# Improved sentiment visualization with stacked bars by sentiment type 
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "stack") +  # Stack positive and negative sentiments in each bar
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 2", ".png", sep = ""), plot = Plot, width = 8, height = 5)

5 Pareto Chart of Comment Types

The number of comments is stratified by comment types and shown in a Pareto chart.


# Calculate the frequency and cumulative percentage for each Type
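# The cumulative percentage drives the Pareto line; e.g., the largest type
# ("Changes", ~52% of clean comments) already accounts for about half of the total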
type_frequency <- TextData %>%
  count(Type, sort = TRUE) %>%
  mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)

# Plot the Pareto chart
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +                       # Bars for frequency
  geom_line(aes(y = cumulative_percentage * max(type_frequency$n) / 100, group = 1),  # Line for cumulative %
            color = "red", linewidth = 1) +
  geom_point(aes(y = cumulative_percentage * max(type_frequency$n) / 100), color = "red") +  # Points on the line
  scale_y_continuous(
    sec.axis = sec_axis(~ . / max(type_frequency$n) * 100, name = "Cumulative Percentage")  # Secondary y-axis for %
  ) +
  labs(title = "Pareto Chart of Comment Types",
       x = NULL,
       y = "Frequency") +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Pareto Chart of Comment Types", ".png", sep = ""), plot = Plot, width = 8, height = 5)

6 Word Frequency Analysis by Business, Month, and Type

Purpose: To understand the key terms and themes in the dataset and their relevance across categories. Explanation: The most frequently occurring words are identified and visualised; the most common words are extracted by business and comment type and displayed in bar charts.


# Count word frequencies by Business and Type
word_freq <- tokens %>%
  group_by(Business, Type, word) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))

# Filter for the top 10 words within each Type
top_10_words <- word_freq %>%
  group_by(Type) %>%
  slice_max(order_by = freq, n = 10) %>%  # Select the top 10 words by frequency within each Type
  ungroup()

# Make the Graph
Plot <- ggplot(top_10_words, aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Type, scales = "free", ncol = 2) +
  coord_flip() +
  labs(title = "Top Words by Business and Type",
       x = "Words", y = "Frequency") +
  scale_x_reordered() +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Word Frequency Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)

cat("\nShow Top 10 Words Table\n")

Show Top 10 Words Table

knitr::kable(head(top_10_words), format = "html", caption = "Top 10 Words by Business and Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Top 10 Words by Business and Type
Business Type word freq
Division A Cancel cancel 829
Division B Cancel cancel 559
Division A Cancel customer 384
Division A Cancel reprocess 359
Division A Cancel line 305
Division A Cancel item 240

cat("\nShow Top 10 Terms Table\n")

Show Top 10 Terms Table

knitr::kable(head(terms), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
new(“standardGeneric”, .Data = function (x, …)
standardGeneric(“terms”), generic = structure(“terms”, package = “stats”),
package = “stats”, group = list(), valueClass = character(0),
signature = “x”, default = new(“derivedDefaultMethod”, .Data = function (x,
…)
UseMethod(“terms”), target = new(“signature”, .Data = “ANY”,

7 Topic Modelling by Business, Month, and Type Using LDA

Purpose: To uncover the underlying themes or categories within the dataset, providing a high-level view of the content. Explanation: Latent Dirichlet Allocation (LDA) is applied to discover hidden topics within the text. The optimal number of topics is determined using the indicators of Arun (2010), CaoJuan (2009), Deveaud (2014), and Griffiths (2004), and the most frequent terms within each topic are then extracted to assign meaningful labels to the topics.

7.1 Identify the Best Number of Topics for LDA

First, the number of topics is decided based on the indicators of Arun (2010), CaoJuan (2009), Deveaud (2014), and Griffiths (2004): the Arun and CaoJuan indicators should be minimised, whereas the Deveaud and Griffiths indicators should be maximised.


# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)

# Create Document-Term Matrix (one document per comment ID)
dtm <- tokens %>%
  count(ID, word) %>%
  cast_dtm(ID, word, n)

# Identify number of topics
LDA_result <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result)

7.2 Fit the LDA Model

Second, Latent Dirichlet Allocation (LDA) is run with a number of topics informed by the previous step. For practical reasons, six topics are chosen for further analysis.


# Define Number of Topics
NumTopics <- 6

# Perform LDA with the specified number of topics
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))

# Extract top terms per topic
topics <- tidy(lda_model, matrix = "beta")
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 10) %>%  # top_n() is superseded; keep the 10 highest-beta terms
  ungroup()

# Visualize the top terms in each topic
Plot <- ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 3) +
  coord_flip() +
  labs(title = "Top Terms per Topic",
       x = "Term",
       y = "Frequency") +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) + 
  scale_x_reordered()

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Top Terms in each Topic", ".png", sep = ""), plot = Plot, width = 15, height = 10)

7.3 Topic Modelling Results

Extract the most frequent terms per topic; this helps to assign themes to the topics.


# Have a look at some of the results (posterior distributions)
tmResult <- posterior(lda_model)

# Format of the resulting object
attributes(tmResult)

$names
[1] "terms"  "topics"

nTerms(dtm)              # lengthOfVocab

[1] 11189


beta <- tmResult$terms   # get beta from results
dim(beta)   

[1] 6 11189


rowSums(beta)            # rows in beta sum to 1

1 2 3 4 5 6 
1 1 1 1 1 1 


# Show 10 Most Frequent Terms per Topic
terms(lda_model, 10)
      Topic 1     Topic 2    Topic 3    Topic 4     Topic 5     Topic 6      
 [1,] "delivery"  "message"  "customer" "call"      "delivery"  "delivery"   
 [2,] "date"      "date"     "cancel"   "check"     "date"      "date"       
 [3,] "line"      "delivery" "invoice"  "customer"  "pm"        "not"        
 [4,] "msg"       "card"     "refund"   "recipient" "msg"       "pdo"        
 [5,] "address"   "msg"      "kindly"   "sender"    "change"    "msg"        
 [6,] "change"    "change"   "charge"   "status"    "time"      "change"     
 [7,] "remove"    "family"   "item"     "delivery"  "recipient" "instruction"
 [8,] "pte"       "wish"     "due"      "deliver"   "assist"    "add"        
 [9,] "street"    "happy"    "amount"   "hamper"    "sender"    "shift"      
[10,] "singapore" "dear"     "assist"   "receive"   "pl"        "special"    


# Give Topics Names
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
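# e.g., Topic 1 receives the label "delivery date line msg address"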
topicNames
                                Topic 1                                 Topic 2 
       "delivery date line msg address"        "message date delivery card msg" 
                                Topic 3                                 Topic 4 
"customer cancel invoice refund kindly"  "call check customer recipient sender" 
                                Topic 5                                 Topic 6 
          "delivery date pm msg change"             "delivery date not pdo msg" 


# For every document we have a probability distribution of its contained topics
theta <- tmResult$topics 
dim(theta)   

[1] 17990 6
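
As a quick consistency check (a sketch; its output is not shown in this report), each row of theta is a probability distribution over the six topics and should sum to one:


# Inspect the topic proportions of the first document; the six values sum to 1
round(theta[1, ], 3)
sum(theta[1, ])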

7.4 Visualise Modelling Results

Purpose: To make the results accessible and actionable for stakeholders. Explanation: Various visualisations were created, such as bar charts, word clouds, and topic distributions, to present findings in an easily interpretable format. Each visualisation was tailored to highlight specific insights, such as topic importance or sentiment trends.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)

# Draw a word cloud for each topic
for(topicToViz in 1:NumTopics)  {
  
  # Select the 50 most probable terms for this topic by sorting the
  # term-topic probability vector in decreasing order
  topTerms      <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:50]
  words         <- names(topTerms)
  probabilities <- topTerms
  
  # Visualise the terms as a word cloud
  mycolors <- brewer.pal(8, "Dark2")
  wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)
  
}

7.5 Visualise Topic Distribution within Documents

Identify the distribution of topics in documents, i.e., comments.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
if(! "reshape2" %in% installed.packages()) { install.packages("reshape2", dependencies = TRUE) }
library(reshape2)

# Convert TextData$Complaint to a data frame if it's not already
textdata <- data.frame(text = TextData$Complaint, doc_id = TextData$ID, stringsAsFactors = FALSE)

# Load stopwords
english_stopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")

# Create corpus object
corpus <- Corpus(DataframeSource(textdata))

# Preprocessing chain
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, english_stopwords)
processedCorpus <- tm_map(processedCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stemDocument, language = "en")
processedCorpus <- tm_map(processedCorpus, stripWhitespace)

# Make Examples
exampleIds <- c(2, 222, 1111, 2224, 3334)

# Show sample comments
# cat("\nShow sample documents, i.e., comments:\n\n")
# lapply(corpus[exampleIds], as.character)

N <- length(exampleIds)


# Get topic proportions from the example documents
topicProportionExamples <- theta[exampleIds,]

colnames(topicProportionExamples) <- topicNames

vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(exampleIds)), 
                     variable.name = "topic", id.vars = "document")  

Plot <- ggplot(data = vizDataFrame, aes(topic, value, fill = document)) + 
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("2" = "lightgreen", "222" = "lightblue", "1111" = "pink", "2224" = "brown", "3334" = "orange"),
                    labels = c("Com 2", "Com 222", "Com 1111", "Com 2224", "Com 3334"),
                    name = "Comment") +  # Custom legend title and labels
  facet_wrap(~ document, ncol = 5) +
  labs(title = "Topic Proportions in Sample Comments",
       x = NULL, y = "Proportion") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text  = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
  ) 

# Print the Plot
Plot

# Save the plot
ggsave(filename = paste("Topic Proportion Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)

8 Emotion Analysis by Business, Month, and Type

The NRC lexicon assigns words to specific emotions such as anger, joy, and sadness, as well as to general positive or negative sentiment. Each word in the tokenised text data is matched with its corresponding emotions, and the matches are counted by Business, Month, and Type. Reshaping these counts into a long format is required for visualisation with ggplot2, and the Month column is kept as a standard date type so that plots are ordered chronologically. Using str_to_title() ensures consistent capitalisation of the emotion labels. Assigning a distinct colour to each emotion improves the readability of the visualisations; grouping similar emotions (e.g., red shades for negative emotions, green shades for positive ones) further improves clarity.


# Load NRC sentiment lexicon
nrc_lexicon <- get_sentiments("nrc")

# Join tokens with NRC lexicon to get emotions
emotions <- tokens %>%
  inner_join(nrc_lexicon, by = "word") %>%
  count(Business, Month, Type, sentiment) %>%
  spread(sentiment, n, fill = 0)

# Reshape data for plotting
emotion_long <- emotions %>%
  gather(key = "emotion", value = "count", -Business, -Month, -Type) %>%
  mutate(Month = ymd(Month))  %>%   # Requires lubridate package
  mutate(emotion = str_to_title(emotion))

# Define custom colors for each emotion
emotion_colors <- c(
  "Anger" = "firebrick",
  "Anticipation" = "lightgrey",
  "Disgust" = "brown",
  "Fear" = "tomato",
  "Joy" = "lightgreen",
  "Negative" = "indianred",
  "Positive" = "forestgreen",
  "Sadness" = "salmon",
  "Surprise" = "palegreen",
  "Trust" = "darkolivegreen"
)

# Plot emotions per type (overview across all types)
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = emotion_colors, name = "Emotion") +  # Assign custom colors
  facet_wrap(~ Type, scales = "free") +
  labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
       x = NULL, y = "Emotion Count") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

# Print the overview plot
Plot


# Unique Types in the data
unique_types <- unique(emotion_long$Type)

# Loop over each type and create an independent plot
for (type in unique_types) {
  
  # Filter data for the current Type
  data_subset <- emotion_long %>% filter(Type == type)
  
  # Plot for the current Type
  Plot <- ggplot(data_subset, aes(x = Month, y = count, fill = emotion)) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = emotion_colors, name = "Emotion") +  # Assign custom colors
    labs(title = paste("Emotion Analysis for Type:", type),
         x = NULL, y = "Emotion Count") +
    theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

  # Print the plot
  print(Plot) 

  # Save the plot
  ggsave(filename = paste("Emotion_Analysis_Type_", type, ".png", sep = ""), plot = Plot, width = 10, height = 6)
  
}