1 Load Necessary Libraries and Make Some Settings


# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidytext" %in% installed.packages()) { install.packages("tidytext", dependencies = TRUE) }
library(tidytext)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "textclean" %in% installed.packages()) { install.packages("textclean", dependencies = TRUE) }
library(textclean)
if(! "tm" %in% installed.packages()) { install.packages("tm", dependencies = TRUE) }
library(tm)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "lubridate" %in% installed.packages()) { install.packages("lubridate", dependencies = TRUE) }
library(lubridate)
if(! "topicmodels" %in% installed.packages()) { install.packages("topicmodels", dependencies = TRUE) }
library(topicmodels)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
if(! "kableExtra" %in% installed.packages()) { install.packages("kableExtra", dependencies = TRUE) }
library(kableExtra)


# Global Settings
options(digits =   4)
options(scipen = 999)
setwd("~/AD Data/TextMining/R")

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

2 Get Data and Add ID

Data is retrieved from URL. Original ID number is replaced by new ID number in Date sequence. Amended data file is saved.


# Add ID
TextData <- TextData[, -2]
TextData <- TextData[order(TextData$EnqDate),]
TextData$Complaint_ID <- seq.int(nrow(TextData))

# Save Original
TextDataRaw <- TextData

# Write Data to WD
write.csv(TextData, file = paste(today, "TextData.csv"))

3 Prepare Data

3.1 Explore Data

Data frame is explored by showing available columns, structure and first five rows of data.


# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Show Characteristics of Data Frame
cat("\nShow First Rows of Data Table")

Show First Rows of Data Table


TextData <- TextData %>% mutate_all(~iconv(., from = "", to = "UTF-8", sub = ""))
knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint Complaint_ID
Division A 2020-07-01 OTHERS 4151596/4151811/4152515/4153389/4153473 No information 1
Division A 2020-07-01 OTHERS 1627628/4145180/4146985/4147637 No information 2
Division A 2020-07-01 CHANGES 7101436529

Old Delivery Date: 02/07/2020 TO 02/07/2020 AM New Delivery Date : 02/07/2020 TO 02/07/2020 AM

3.2 Msg :

kindly assist to amend the message and add in Ǿ϶Ǿ?ǾǾ?

Ǿ϶Ǿ?ǾǾ? ?݃??Ǿ?ݶǸ?
3
Division A 2020-07-01 CHANGES 7101436521

Old Delivery Date: 01/07/2020 TO 01/07/2020 AT New Delivery Date : 01/07/2020 TO 01/07/2020 AT

3.3 Msg :

Please change the Father to One as below

With Deepest Sympathy & Heartfelt Condolences On The Demise Of Your Beloved One
4
Division A 2020-07-01 DELIVERY STATUS 7101436479

kindly check delivery status for today PM delivery and called back. Thank you

5
Division A 2020-07-01 DELIVERY STATUS 7101436235 Recipient husband called,claimed was told by wife suppose to receive item from us however till now yet to receive. Mentioned did receive several missed call but not sure is it from us. Please assist to check & revert to Recipient husband. In system 1st attempt was failed but subsequent delivery status 4 but custs never receive. 6

3.4 Clean Data and add Month Column

Encoding of text data in UTF-8 ensures compatibility with all tools. All text data is turned into lower case. Date columns is turned into R date format. Month column is created.


# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)

# Convert the string to UTF-8 encoding
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")

# Remove multibyte characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)

# Omitting any NA values
TextData <- TextData %>%
  filter(!is.na(Complaint)) %>%
  mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
  mutate(Type = str_to_title(Type)) # Convert to titlecase

# Ensure your date format is consistent
TextData$EnqDate <- as.Date(TextData$EnqDate)

# Extract month (in "Year-Month" format)
TextData <- TextData %>%
  mutate(Month = floor_date(EnqDate, "month"))

# Show Characteristics of Data Frame
cat("\nShow First Rows of Data Table\n")

Show First Rows of Data Table

knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint Complaint_ID Month
Division A 2020-07-01 Others 4151596/4151811/4152515/4153389/4153473 no information 1 2020-07-01
Division A 2020-07-01 Others 1627628/4145180/4146985/4147637 no information 2 2020-07-01
Division A 2020-07-01 Changes 7101436529

old delivery date: 02/07/2020 to 02/07/2020 am new delivery date : 02/07/2020 to 02/07/2020 am

3.5 msg :

kindly assist to amend the message and add in ??

?? ?????
3 2020-07-01
Division A 2020-07-01 Changes 7101436521

old delivery date: 01/07/2020 to 01/07/2020 at new delivery date : 01/07/2020 to 01/07/2020 at

3.6 msg :

please change the father to one as below

with deepest sympathy & heartfelt condolences on the demise of your beloved one
4 2020-07-01
Division A 2020-07-01 Delivery Status 7101436479

Kindly check delivery status for today pm delivery and called back.

Thank you
5 2020-07-01
Division A 2020-07-01 Delivery Status 7101436235 recipient husband called, claimed was told by wife suppose to receive item from us however till now yet to receive. mentioned did receive several missed call but not sure is it from us. please assist to check & revert to recipient husband. in system 1st attempt was failed but subsequent delivery status 4 but custs never receive. 6 2020-07-01

3.7 Preprocess and Tokenise Data

Punctuation, numbers, stop words (common words like ‘the’, ‘and’) and names are removed. All comments are split into initial words, t.e., tokens. Empty comments and comments containing “no information” are omitted.


# Example of text cleaning
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint) 
TextData$Complaint <- removePunctuation(TextData$Complaint) 
TextData$Complaint <- removeNumbers(TextData$Complaint) 
TextData <- TextData %>%
    filter(Complaint != "no information") %>%
    mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))

# Define custom stopwords (e.g., names)
custom_stopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern to match names

# Perform text analysis excluding stopwords and names
TextDatafreq <- TextData %>%
    unnest_tokens(word, Complaint) %>%
    filter(!word %in% custom_stopwords) %>%
    filter(!str_detect(word, date_pattern)) %>%
    filter(!str_detect(word, name_pattern)) %>%
    count(word, sort = TRUE)

# Tokenize text data and remove stop words
tokens <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  anti_join(stop_words) %>%
  filter(!is.na(word))

# Join tokens with stratified data
tokens <- tokens %>%
  inner_join(TextData %>% select(Complaint_ID, Business, Month, Type), by = "Complaint_ID") %>%
  select(-Business.y, -Month.y, -Type.y) %>%
  rename(
        Business = Business.x,
        Month    = Month.x,
        Type     = Type.x
    )

NumRawComments <- format(as.numeric(count(TextDataRaw)), big.mark=",", scientific = F)
NumComments <- format(as.numeric(count(TextData)), big.mark=",", scientific = F)
NumTokens <- format(as.numeric(count(tokens)), big.mark=",", scientific = F)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")

cat("\nThis dataset includes:\n")

This dataset includes:

cat("- Number of raw comments:", NumRawComments,"\n")
  • Number of raw comments: 23,801
cat("- Number of clean comments:", NumComments,"\n")
  • Number of clean comments: 18,043
cat("- Number of tokens:", NumTokens,"\n")
  • Number of tokens: 225,957
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
  • Comments between: 01 Jul 2020 to 30 Jun 2021

# Show Characteristics of Tokens
cat("\nShow First Rows of Tokens Table\n")

Show First Rows of Tokens Table

knitr::kable(head(tokens), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint_ID Month word
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 delivery
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 date
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 delivery
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 date
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 msg
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 kindly

3.8 Explore Cleaned Data

Data frame is explored by showing available columns, structure and first five rows of data.


# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Attempt to remove any problematic characters, including control and special characters
TextData <- TextData %>%
  mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = ""))  # Removes invalid characters

cat("\nDescriptive Statistics of Columns in TextData:\n")

Descriptive Statistics of Columns in TextData:

st(TextData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
         Variable     N   Mean SD Min P25 P50 P75 Max NA Mode

1 Business 18043
2 … Division A 12671 70.23%
3 … Division B 5372 29.77%
4 Type 18043
5 … Cancel 1618 8.97%
6 … Changes 9332 51.72%
7 … Delivery Status 993 5.5%
8 … Feedback 3419 18.95%
9 … Not Delivered 2681 14.86%


# Show Characteristics of Tokens
cat("\nShow First Rows of Tokens Table\n")

Show First Rows of Tokens Table

knitr::kable(head(tokens), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint_ID Month word
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 delivery
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 date
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 delivery
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 date
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 msg
Division A 2020-07-01 Changes 7101436529 3 2020-07-01 kindly

cat("\n\nShow Text Data Frequency Table\n")

Show Text Data Frequency Table

knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business EnqDate Type Order Complaint Complaint_ID Month
Division A 2020-07-01 Changes 7101436529 old delivery date to am new delivery date to am msg kindly assist to amend the message and add in 3 2020-07-01
Division A 2020-07-01 Changes 7101436521 old delivery date to at new delivery date to at msg please change the father to one as below with deepest sympathy heartfelt condolences on the demise of your beloved one 4 2020-07-01
Division A 2020-07-01 Delivery Status 7101436479 kindly check delivery status for today pm delivery and called back ms feonne soo or feonnesoo gmail com thank you 5 2020-07-01
Division A 2020-07-01 Delivery Status 7101436235 recipient husband called claimed was told by wife suppose to receive item from us however till now yet to receive mentioned did receive several missed call but not sure is it from us please assist to check revert to recipient husband mr sakthi in system st attempt was failed but subsequent delivery status but custs never receive 6 2020-07-01
Division B 2020-07-01 Cancel 4101073613 please cancel this order due to no stock item in line 7 2020-07-01
Division B 2020-07-01 Feedback 4101073586 please refund eca due to no stock sap only key amount for xba with no delivery fee 8 2020-07-01

4 Sentiment Analysis by Business, Month and Type Using Bing Lexicon

Using the established lexicon Bing, sentiments are assigned to tokens. The result of positive and negative sentiments by business and month is printed in bar charts.


# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")

# Join tokens with Bing sentiments and calculate scores
sentiment_scores <- tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Business, Month, Type) %>%
  summarise(sentiment_score = sum(score)) %>%
  ungroup()

# Example of improved sentiment visualization
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "dodge") +  # Stacked bars for each Business within each Month
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 1", ".png", sep = ""), plot = Plot, width = 8, height = 5)

# Improved sentiment visualization with stacked bars by sentiment type 
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "stack") +  # Stack positive and negative sentiments in each bar
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 2", ".png", sep = ""), plot = Plot, width = 8, height = 5)

6 Pareto Chart of Comment Types

The number of comments is stratified by comment types and shown in a Pareto chart.


# Calculate the frequency and cumulative percentage for each Type
type_frequency <- TextData %>%
  count(Type, sort = TRUE) %>%
  mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)

# Plot the Pareto chart
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +                       # Bars for frequency
  geom_line(aes(y = cumulative_percentage * max(type_frequency$n) / 100, group = 1),  # Line for cumulative %
            color = "red", size = 1) +
  geom_point(aes(y = cumulative_percentage * max(type_frequency$n) / 100), color = "red") +  # Points on the line
  scale_y_continuous(
    sec.axis = sec_axis(~ . / max(type_frequency$n) * 100, name = "Cumulative Percentage")  # Secondary y-axis for %
  ) +
  labs(title = "Pareto Chart of Comment Types",
       x = NULL,
       y = "Frequency") +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Pareto Chart of Comment Types", ".png", sep = ""), plot = Plot, width = 8, height = 5)

7 Word Frequency Analysis by Business, Month, and Type

The most common words in comments are extracted by business and comment type and displayed in bar charts.


# Count word frequencies by Business, Month, and Type
word_freq <- tokens %>%
  group_by(Business, Type, word) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))

# Visualize top words by stratified groups
top_words <- word_freq %>%
  group_by(Business, Type) %>%
  ungroup()

# Filter for top 10 words across all types
top_10_words <- top_words %>%
  group_by(Type) %>%
  slice_max(order_by = freq, n = 10) %>%  # Select the top 10 words by frequency within each Type
  ungroup()

# Make the Graph
Plot <- ggplot(top_10_words, aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Type, scales = "free", ncol = 2) +
  coord_flip() +
  labs(title = "Top Words by Business and Type",
       x = "Words", y = "Frequency") +
  scale_x_reordered() +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Word Frequency Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)

cat("\nShow Top 10 Words Table\n")

Show Top 10 Words Table

knitr::kable(head(top_10_words), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
Business Type word freq
Division A Cancel cancel 819
Division B Cancel cancel 417
Division A Cancel customer 375
Division A Cancel reprocess 311
Division A Cancel line 288
Division A Cancel delivery 230

cat("\nShow Top 10 Terms Table\n")

Show Top 10 Terms Table

knitr::kable(head(terms), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from TextData
new(“standardGeneric”, .Data = function (x, …)
standardGeneric(“terms”), generic = structure(“terms”, package = “stats”),
package = “stats”, group = list(), valueClass = character(0),
signature = “x”, default = new(“derivedDefaultMethod”, .Data = function (x,
…)
UseMethod(“terms”), target = new(“signature”, .Data = “ANY”,

8 Topic Modelling by Business, Month, and Type Using LDA

8.1 Identifying the Best Number of Topics for LDA

Latent Dirichlet Allocation (LDA) is used to discover hidden topics. Firstly, the number of topics needs to be decided based on indicators by (2010), CaoJuan (2009), Deveaud (2014) and Griffiths (2004). Arjun and CaoJuan should be minimised, whereas indicators by Deveaud and Griffiths should be maximised.


# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)

# Create Document-Term Matrix by stratified groups
dtm <- tokens %>%
  count(Complaint_ID, word) %>%
  cast_dtm(Complaint_ID, word, n)

# Identify number of topics
LDA_result <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result)

8.2 Topic Modeling by Business, Month, and Type Using LDA

Secondly, Latent Dirichlet Allocation (LDA) is conducted with the number of topics derived in the previous step. For practicality reasons, a topic number of six is chosen for further analysis.


# Define Number of Topics
NumTopics = 6

# Perform LDA with a specified number of topics (e.g., 3 topics)
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))

# Extract top terms per topic
topics <- tidy(lda_model, matrix = "beta")
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup()

# Visualize the top terms in each topic
Plot <- ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 2) +
  coord_flip() +
  labs(title = "Top Terms per Topic",
       x = "Term",
       y = "Frequency") +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) + 
  scale_x_reordered()

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Top Terms in each Topic", ".png", sep = ""), plot = Plot, width = 10, height = 8)

8.3 Topic Modelling Results

Extract the most frequent terms per topic and business. This should help to assign themes to topics.


# Have a look at some of the results (posterior distributions)
tmResult <- posterior(lda_model)

# Format of the resulting object
attributes(tmResult)

$names [1] “terms” “topics”

nTerms(dtm)              # lengthOfVocab

[1] 12337


beta <- tmResult$terms   # get beta from results
dim(beta)   

[1] 6 12337


rowSums(beta)            # rows in beta sum to 1

1 2 3 4 5 6 1 1 1 1 1 1


# Show 10 Most Frequent Terms per Topic
terms(lda_model, 10)
  Topic 1     Topic 2     Topic 3    Topic 4     Topic 5      Topic 6      

[1,] “delivery” “check” “customer” “delivery” “date” “pdo”
[2,] “date” “delivery” “cancel” “date” “delivery” “card”
[3,] “pm” “customer” “refund” “line” “message” “reprocess”
[4,] “msg” “sender” “invoice” “address” “msg” “shifted”
[5,] “change” “recipient” “kindly” “msg” “card” “condolences” [6,] “nt” “status” “item” “change” “change” “deepest”
[7,] “assist” “call” “amount” “remove” “pm” “beloved”
[8,] “recipient” “hamper” “assist” “street” “pte” “cancel”
[9,] “add” “item” “due” “singapore” “family” “code”
[10,] “pls” “delivered” “charge” “unit” “management” “sympathy”


# Give Topics Names
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
topicNames
                               Topic 1 
         "delivery date pm msg change" 
                               Topic 2 

“check delivery customer sender recipient” Topic 3 “customer cancel refund invoice kindly” Topic 4 “delivery date line address msg” Topic 5 “date delivery message msg card” Topic 6 “pdo card reprocess shifted condolences”


# For every document we have a probability distribution of its contained topics
theta <- tmResult$topics 
dim(theta)   

[1] 17990 6

8.4 Visualise Modelling Results

Display the most frequent terms per topic and business in wordcoulds.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)

# Saving each plot as an image
for(topicToViz in 1:NumTopics)  {
  
  # Visualize topics as word cloud
  topicToViz <- topicToViz # change for your own topic of interest
  
  # Select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
  top40terms <- sort(tmResult$terms[topicToViz,], decreasing = TRUE)[1:50]
  words <- names(top40terms)
  
  # Extract the probabilites of each of the 40 terms
  probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:50]
  
  # Visualize the terms as wordcloud
  mycolors <- brewer.pal(8, "Dark2")
  wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

  # Save the plot
  # ggsave(filename = paste("Word Cloud ", wordcloud, ".png", sep = ""), plot = p, width = 8, height = 5)
  
}

8.5 Visualise Topic Distribution within Documents

Identify the distribution of topics in documents, i.e., comments.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
if(! "reshape2" %in% installed.packages()) { install.packages("reshape2", dependencies = TRUE) }
library(reshape2)

# Convert TextData$Complaint to a data frame if it's not already
textdata <- data.frame(text = TextData$Complaint, doc_id = TextData$Complaint_ID, stringsAsFactors = FALSE)

# Load stopwords
english_stopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")

# Create corpus object
corpus <- Corpus(DataframeSource(textdata))

# Preprocessing chain
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, english_stopwords)
processedCorpus <- tm_map(processedCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stemDocument, language = "en")
processedCorpus <- tm_map(processedCorpus, stripWhitespace)

# Make Examples
exampleIds <- c(2, 222, 1111, 2224, 3334)

# Show sample comments
cat("\nShow sample documents, i.e., comments:\n\n")

Show sample documents, i.e., comments:

lapply(corpus[exampleIds], as.character)

$4 [1] “old delivery date to at new delivery date to at msg please change the father to one as below with deepest sympathy heartfelt condolences on the demise of your beloved one”

$357 [1] “sender mention that the item is not delivered yesterday morning and recipient discharged yesterday afternoon please arrange to resend to b rivervale drive singapore thank you”

$1646 [1] “please charge additional as there is a delivery charge for this corporate account please email customer the revised invoice”

$3355 [1] “change product code pls cancel this order and will reprocess”

$5084 [1] “old delivery date to pm new delivery date to nt msg please change the delivery timing tonight pm pm”


N <- length(exampleIds)


# Get topic proportions form example documents
topicProportionExamples <- theta[exampleIds,]

colnames(topicProportionExamples) <- topicNames

vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(exampleIds)), 
                     variable.name = "topic", id.vars = "document")  

Plot <- ggplot(data = vizDataFrame, aes(topic, value, fill = document), ylab = "proportion") + 
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("2" = "lightgreen", "222" = "lightblue", "1111" = "pink", "2224" = "brown", "3334" = "orange"),
                    labels = c("Com 2", "Com 222", "Com 1111", "Com 2224", "Com 3334"),
                    name = "Comment") +  # Custom legend title and labels
  facet_wrap(~ document, ncol = 5) +
    labs(title = paste("Topic Proportions in Sample Comments"),
         x = NULL, y = "Topic") +
    theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
  ) 

# Save the plot
ggsave(filename = paste("Topic Proportion Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)

9 Emotion Analysis by Business, Month, and Type

An emotion analysis is performed using the NRC lexicon to attach emotions to each comment and show them over time.


# Load NRC sentiment lexicon
nrc_lexicon <- get_sentiments("nrc")

# Join tokens with NRC lexicon to get emotions
emotions <- tokens %>%
  inner_join(nrc_lexicon, by = "word") %>%
  count(Business, Month, Type, sentiment) %>%
  spread(sentiment, n, fill = 0)

# Reshape data for plotting
emotion_long <- emotions %>%
  gather(key = "emotion", value = "count", -Business, -Month, -Type) %>%
  mutate(Month = ymd(Month))  %>%   # Requires lubridate package
  mutate(emotion = str_to_title(emotion))

# Define custom colors for each emotion
emotion_colors <- c(
  "Anger" = "firebrick",
  "Anticipation" = "lightgrey",
  "Disgust" = "brown",
  "Fear" = "tomato",
  "Joy" = "lightgreen",
  "Negative" = "indianred",
  "Positive" = "forestgreen",
  "Sadness" = "salmon",
  "Surprise" = "palegreen",
  "Trust" = "darkolivegreen"
)

# Plot emotions per type
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ Type, scales = "free") +
  labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
       x = NULL, y = "Emotion Count") +
  theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 


# Unique Types in the data
unique_types <- unique(emotion_long$Type)

# Loop over each type and create an independent plot
for (type in unique_types) {
  
  # Filter data for the current Type
  data_subset <- emotion_long %>% filter(Type == type)
  
  # Plot for the current Type
  Plot <- ggplot(data_subset, aes(x = Month, y = count, fill = emotion)) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = emotion_colors, name = "Emotion") +  # Assign custom colors
    labs(title = paste("Emotion Analysis for Type:", type),
         x = NULL, y = "Emotion Count") +
    theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

  # Print the plot
  print(Plot) 

  # Save the plot
  ggsave(filename = paste("Emotion_Analysis_Type_", type, ".png", sep = ""), plot = Plot, width = 10, height = 6)
  
}