# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidytext" %in% installed.packages()) { install.packages("tidytext", dependencies = TRUE) }
library(tidytext)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "textclean" %in% installed.packages()) { install.packages("textclean", dependencies = TRUE) }
library(textclean)
if(! "tm" %in% installed.packages()) { install.packages("tm", dependencies = TRUE) }
library(tm)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "lubridate" %in% installed.packages()) { install.packages("lubridate", dependencies = TRUE) }
library(lubridate)
if(! "topicmodels" %in% installed.packages()) { install.packages("topicmodels", dependencies = TRUE) }
library(topicmodels)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
if(! "kableExtra" %in% installed.packages()) { install.packages("kableExtra", dependencies = TRUE) }
library(kableExtra)
# Global Settings
options(digits = 4)
options(scipen = 999)
setwd("~/AD Data/TextMining/R")
# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
Data is retrieved from URL. Original ID number is replaced by new ID number in Date sequence. Amended data file is saved.
# Add ID
TextData <- TextData[, -2]
TextData <- TextData[order(TextData$EnqDate),]
TextData$Complaint_ID <- seq.int(nrow(TextData))
# Save Original
TextDataRaw <- TextData
# Write Data to WD
write.csv(TextData, file = paste(today, "TextData.csv"))
Data frame is explored by showing available columns, structure and first five rows of data.
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Show Characteristics of Data Frame
cat("\nShow First Rows of Data Table")
Show First Rows of Data Table
TextData <- TextData %>% mutate_all(~iconv(., from = "", to = "UTF-8", sub = ""))
knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | Complaint | Complaint_ID |
---|---|---|---|---|---|
Division A | 2020-07-01 | OTHERS | 4151596/4151811/4152515/4153389/4153473 | No information | 1 |
Division A | 2020-07-01 | OTHERS | 1627628/4145180/4146985/4147637 | No information | 2 |
Division A | 2020-07-01 | CHANGES | 7101436529 |
Old Delivery Date: 02/07/2020 TO 02/07/2020 AM New Delivery Date : 02/07/2020 TO 02/07/2020 AM 3.2 Msg :kindly assist to amend the message and add in Ǿ϶Ǿ?ǾǾ? Ǿ϶Ǿ?ǾǾ? ?݃??Ǿ?ݶǸ? |
3 |
Division A | 2020-07-01 | CHANGES | 7101436521 |
Old Delivery Date: 01/07/2020 TO 01/07/2020 AT New Delivery Date : 01/07/2020 TO 01/07/2020 AT 3.3 Msg :Please change the Father to One as below With Deepest Sympathy & Heartfelt Condolences On The Demise Of Your Beloved One |
4 |
Division A | 2020-07-01 | DELIVERY STATUS | 7101436479 |
kindly check delivery status for today PM delivery and called back. Thank you |
5 |
Division A | 2020-07-01 | DELIVERY STATUS | 7101436235 | Recipient husband called,claimed was told by wife suppose to receive item from us however till now yet to receive. Mentioned did receive several missed call but not sure is it from us. Please assist to check & revert to Recipient husband. In system 1st attempt was failed but subsequent delivery status 4 but custs never receive. | 6 |
Encoding of text data in UTF-8 ensures compatibility with all tools. All text data is turned into lower case. Date columns is turned into R date format. Month column is created.
# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)
# Convert the string to UTF-8 encoding
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")
# Remove multibyte characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)
# Omitting any NA values
TextData <- TextData %>%
filter(!is.na(Complaint)) %>%
mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
mutate(Type = str_to_title(Type)) # Convert to titlecase
# Ensure your date format is consistent
TextData$EnqDate <- as.Date(TextData$EnqDate)
# Extract month (in "Year-Month" format)
TextData <- TextData %>%
mutate(Month = floor_date(EnqDate, "month"))
# Show Characteristics of Data Frame
cat("\nShow First Rows of Data Table\n")
Show First Rows of Data Table
knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | Complaint | Complaint_ID | Month |
---|---|---|---|---|---|---|
Division A | 2020-07-01 | Others | 4151596/4151811/4152515/4153389/4153473 | no information | 1 | 2020-07-01 |
Division A | 2020-07-01 | Others | 1627628/4145180/4146985/4147637 | no information | 2 | 2020-07-01 |
Division A | 2020-07-01 | Changes | 7101436529 |
old delivery date: 02/07/2020 to 02/07/2020 am new delivery date : 02/07/2020 to 02/07/2020 am 3.5 msg :kindly assist to amend the message and add in ?? ?? ????? |
3 | 2020-07-01 |
Division A | 2020-07-01 | Changes | 7101436521 |
old delivery date: 01/07/2020 to 01/07/2020 at new delivery date : 01/07/2020 to 01/07/2020 at 3.6 msg :please change the father to one as below with deepest sympathy & heartfelt condolences on the demise of your beloved one |
4 | 2020-07-01 |
Division A | 2020-07-01 | Delivery Status | 7101436479 |
Kindly check delivery status for today pm delivery and called back. Thank you |
5 | 2020-07-01 |
Division A | 2020-07-01 | Delivery Status | 7101436235 | recipient husband called, claimed was told by wife suppose to receive item from us however till now yet to receive. mentioned did receive several missed call but not sure is it from us. please assist to check & revert to recipient husband. in system 1st attempt was failed but subsequent delivery status 4 but custs never receive. | 6 | 2020-07-01 |
Punctuation, numbers, stop words (common words like ‘the’, ‘and’) and names are removed. All comments are split into initial words, t.e., tokens. Empty comments and comments containing “no information” are omitted.
# Example of text cleaning
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint)
TextData$Complaint <- removePunctuation(TextData$Complaint)
TextData$Complaint <- removeNumbers(TextData$Complaint)
TextData <- TextData %>%
filter(Complaint != "no information") %>%
mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))
# Define custom stopwords (e.g., names)
custom_stopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern to match names
# Perform text analysis excluding stopwords and names
TextDatafreq <- TextData %>%
unnest_tokens(word, Complaint) %>%
filter(!word %in% custom_stopwords) %>%
filter(!str_detect(word, date_pattern)) %>%
filter(!str_detect(word, name_pattern)) %>%
count(word, sort = TRUE)
# Tokenize text data and remove stop words
tokens <- TextData %>%
unnest_tokens(word, Complaint) %>%
anti_join(stop_words) %>%
filter(!is.na(word))
# Join tokens with stratified data
tokens <- tokens %>%
inner_join(TextData %>% select(Complaint_ID, Business, Month, Type), by = "Complaint_ID") %>%
select(-Business.y, -Month.y, -Type.y) %>%
rename(
Business = Business.x,
Month = Month.x,
Type = Type.x
)
NumRawComments <- format(as.numeric(count(TextDataRaw)), big.mark=",", scientific = F)
NumComments <- format(as.numeric(count(TextData)), big.mark=",", scientific = F)
NumTokens <- format(as.numeric(count(tokens)), big.mark=",", scientific = F)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")
cat("\nThis dataset includes:\n")
This dataset includes:
cat("- Number of raw comments:", NumRawComments,"\n")
cat("- Number of clean comments:", NumComments,"\n")
cat("- Number of tokens:", NumTokens,"\n")
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
# Show Characteristics of Tokens
cat("\nShow First Rows of Tokens Table\n")
Show First Rows of Tokens Table
knitr::kable(head(tokens), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | Complaint_ID | Month | word |
---|---|---|---|---|---|---|
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | delivery |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | date |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | delivery |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | date |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | msg |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | kindly |
Data frame is explored by showing available columns, structure and first five rows of data.
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Attempt to remove any problematic characters, including control and special characters
TextData <- TextData %>%
mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = "")) # Removes invalid characters
cat("\nDescriptive Statistics of Columns in TextData:\n")
Descriptive Statistics of Columns in TextData:
st(TextData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
Variable N Mean SD Min P25 P50 P75 Max NA Mode
1 Business 18043
2 … Division A 12671 70.23%
3 … Division B 5372 29.77%
4 Type 18043
5 … Cancel 1618 8.97%
6 … Changes 9332 51.72%
7 … Delivery Status 993 5.5%
8 … Feedback 3419 18.95%
9 … Not Delivered 2681 14.86%
# Show Characteristics of Tokens
cat("\nShow First Rows of Tokens Table\n")
Show First Rows of Tokens Table
knitr::kable(head(tokens), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | Complaint_ID | Month | word |
---|---|---|---|---|---|---|
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | delivery |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | date |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | delivery |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | date |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | msg |
Division A | 2020-07-01 | Changes | 7101436529 | 3 | 2020-07-01 | kindly |
cat("\n\nShow Text Data Frequency Table\n")
Show Text Data Frequency Table
knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | Complaint | Complaint_ID | Month |
---|---|---|---|---|---|---|
Division A | 2020-07-01 | Changes | 7101436529 | old delivery date to am new delivery date to am msg kindly assist to amend the message and add in | 3 | 2020-07-01 |
Division A | 2020-07-01 | Changes | 7101436521 | old delivery date to at new delivery date to at msg please change the father to one as below with deepest sympathy heartfelt condolences on the demise of your beloved one | 4 | 2020-07-01 |
Division A | 2020-07-01 | Delivery Status | 7101436479 | kindly check delivery status for today pm delivery and called back ms feonne soo or feonnesoo gmail com thank you | 5 | 2020-07-01 |
Division A | 2020-07-01 | Delivery Status | 7101436235 | recipient husband called claimed was told by wife suppose to receive item from us however till now yet to receive mentioned did receive several missed call but not sure is it from us please assist to check revert to recipient husband mr sakthi in system st attempt was failed but subsequent delivery status but custs never receive | 6 | 2020-07-01 |
Division B | 2020-07-01 | Cancel | 4101073613 | please cancel this order due to no stock item in line | 7 | 2020-07-01 |
Division B | 2020-07-01 | Feedback | 4101073586 | please refund eca due to no stock sap only key amount for xba with no delivery fee | 8 | 2020-07-01 |
Using the established lexicon Bing, sentiments are assigned to tokens. The result of positive and negative sentiments by business and month is printed in bar charts.
# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")
# Join tokens with Bing sentiments and calculate scores
sentiment_scores <- tokens %>%
inner_join(bing_lexicon, by = "word") %>%
mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
group_by(Business, Month, Type) %>%
summarise(sentiment_score = sum(score)) %>%
ungroup()
# Example of improved sentiment visualization
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
geom_col(position = "dodge") + # Stacked bars for each Business within each Month
facet_wrap(~ Business, ncol = 1) + # Facet by Business
scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
labels = c("Positive", "Negative"),
limits = c("TRUE", "FALSE"), # Set order of legend items
name = "Sentiment") + # Custom legend title and labels
labs(title = "Sentiment Analysis by Business and Month",
x = NULL, y = "Sentiment Score") +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 1", ".png", sep = ""), plot = Plot, width = 8, height = 5)
# Improved sentiment visualization with stacked bars by sentiment type
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
geom_col(position = "stack") + # Stack positive and negative sentiments in each bar
facet_wrap(~ Business, ncol = 1) + # Facet by Business
scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
labels = c("Positive", "Negative"),
limits = c("TRUE", "FALSE"), # Set order of legend items
name = "Sentiment") + # Custom legend title and labels
labs(title = "Sentiment Analysis by Business and Month",
x = NULL, y = "Sentiment Score") +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 2", ".png", sep = ""), plot = Plot, width = 8, height = 5)
The number of comments is stratified by comment types and shown in a Pareto chart.
# Calculate the frequency and cumulative percentage for each Type
type_frequency <- TextData %>%
count(Type, sort = TRUE) %>%
mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)
# Plot the Pareto chart
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") + # Bars for frequency
geom_line(aes(y = cumulative_percentage * max(type_frequency$n) / 100, group = 1), # Line for cumulative %
color = "red", size = 1) +
geom_point(aes(y = cumulative_percentage * max(type_frequency$n) / 100), color = "red") + # Points on the line
scale_y_continuous(
sec.axis = sec_axis(~ . / max(type_frequency$n) * 100, name = "Cumulative Percentage") # Secondary y-axis for %
) +
labs(title = "Pareto Chart of Comment Types",
x = NULL,
y = "Frequency") +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Pareto Chart of Comment Types", ".png", sep = ""), plot = Plot, width = 8, height = 5)
The most common words in comments are extracted by business and comment type and displayed in bar charts.
# Count word frequencies by Business, Month, and Type
word_freq <- tokens %>%
group_by(Business, Type, word) %>%
summarise(freq = n()) %>%
arrange(desc(freq))
# Visualize top words by stratified groups
top_words <- word_freq %>%
group_by(Business, Type) %>%
ungroup()
# Filter for top 10 words across all types
top_10_words <- top_words %>%
group_by(Type) %>%
slice_max(order_by = freq, n = 10) %>% # Select the top 10 words by frequency within each Type
ungroup()
# Make the Graph
Plot <- ggplot(top_10_words, aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
geom_col(show.legend = TRUE) +
facet_wrap(~ Type, scales = "free", ncol = 2) +
coord_flip() +
labs(title = "Top Words by Business and Type",
x = "Words", y = "Frequency") +
scale_x_reordered() +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Word Frequency Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)
cat("\nShow Top 10 Words Table\n")
Show Top 10 Words Table
knitr::kable(head(top_10_words), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | Type | word | freq |
---|---|---|---|
Division A | Cancel | cancel | 819 |
Division B | Cancel | cancel | 417 |
Division A | Cancel | customer | 375 |
Division A | Cancel | reprocess | 311 |
Division A | Cancel | line | 288 |
Division A | Cancel | delivery | 230 |
cat("\nShow Top 10 Terms Table\n")
Show Top 10 Terms Table
knitr::kable(head(terms), format = "html", caption = "Sample Data from TextData") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
new(“standardGeneric”, .Data = function (x, …) |
standardGeneric(“terms”), generic = structure(“terms”, package = “stats”), |
package = “stats”, group = list(), valueClass = character(0), |
signature = “x”, default = new(“derivedDefaultMethod”, .Data = function (x, |
…) |
UseMethod(“terms”), target = new(“signature”, .Data = “ANY”, |
Latent Dirichlet Allocation (LDA) is used to discover hidden topics. Firstly, the number of topics needs to be decided based on indicators by (2010), CaoJuan (2009), Deveaud (2014) and Griffiths (2004). Arjun and CaoJuan should be minimised, whereas indicators by Deveaud and Griffiths should be maximised.
# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)
# Create Document-Term Matrix by stratified groups
dtm <- tokens %>%
count(Complaint_ID, word) %>%
cast_dtm(Complaint_ID, word, n)
# Identify number of topics
LDA_result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 15, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.
FindTopicsNumber_plot(LDA_result)
Secondly, Latent Dirichlet Allocation (LDA) is conducted with the number of topics derived in the previous step. For practicality reasons, a topic number of six is chosen for further analysis.
# Define Number of Topics
NumTopics = 6
# Perform LDA with a specified number of topics (e.g., 3 topics)
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))
# Extract top terms per topic
topics <- tidy(lda_model, matrix = "beta")
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup()
# Visualize the top terms in each topic
Plot <- ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free", ncol = 2) +
coord_flip() +
labs(title = "Top Terms per Topic",
x = "Term",
y = "Frequency") +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
) +
scale_x_reordered()
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Top Terms in each Topic", ".png", sep = ""), plot = Plot, width = 10, height = 8)
Extract the most frequent terms per topic and business. This should help to assign themes to topics.
# Have a look at some of the results (posterior distributions)
tmResult <- posterior(lda_model)
# Format of the resulting object
attributes(tmResult)
$names [1] “terms” “topics”
nTerms(dtm) # lengthOfVocab
[1] 12337
beta <- tmResult$terms # get beta from results
dim(beta)
[1] 6 12337
rowSums(beta) # rows in beta sum to 1
1 2 3 4 5 6 1 1 1 1 1 1
# Show 10 Most Frequent Terms per Topic
terms(lda_model, 10)
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
[1,] “delivery” “check” “customer” “delivery” “date” “pdo”
[2,] “date” “delivery” “cancel” “date” “delivery” “card”
[3,] “pm” “customer” “refund” “line” “message” “reprocess”
[4,] “msg” “sender” “invoice” “address” “msg” “shifted”
[5,] “change” “recipient” “kindly” “msg” “card” “condolences” [6,] “nt”
“status” “item” “change” “change” “deepest”
[7,] “assist” “call” “amount” “remove” “pm” “beloved”
[8,] “recipient” “hamper” “assist” “street” “pte” “cancel”
[9,] “add” “item” “due” “singapore” “family” “code”
[10,] “pls” “delivered” “charge” “unit” “management” “sympathy”
# Give Topics Names
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
topicNames
Topic 1
"delivery date pm msg change"
Topic 2
“check delivery customer sender recipient” Topic 3 “customer cancel refund invoice kindly” Topic 4 “delivery date line address msg” Topic 5 “date delivery message msg card” Topic 6 “pdo card reprocess shifted condolences”
# For every document we have a probability distribution of its contained topics
theta <- tmResult$topics
dim(theta)
[1] 17990 6
Display the most frequent terms per topic and business in wordcoulds.
# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
# Saving each plot as an image
for(topicToViz in 1:NumTopics) {
# Visualize topics as word cloud
topicToViz <- topicToViz # change for your own topic of interest
# Select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms <- sort(tmResult$terms[topicToViz,], decreasing = TRUE)[1:50]
words <- names(top40terms)
# Extract the probabilites of each of the 40 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:50]
# Visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)
# Save the plot
# ggsave(filename = paste("Word Cloud ", wordcloud, ".png", sep = ""), plot = p, width = 8, height = 5)
}
Identify the distribution of topics in documents, i.e., comments.
# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
if(! "reshape2" %in% installed.packages()) { install.packages("reshape2", dependencies = TRUE) }
library(reshape2)
# Convert TextData$Complaint to a data frame if it's not already
textdata <- data.frame(text = TextData$Complaint, doc_id = TextData$Complaint_ID, stringsAsFactors = FALSE)
# Load stopwords
english_stopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")
# Create corpus object
corpus <- Corpus(DataframeSource(textdata))
# Preprocessing chain
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, english_stopwords)
processedCorpus <- tm_map(processedCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stemDocument, language = "en")
processedCorpus <- tm_map(processedCorpus, stripWhitespace)
# Make Examples
exampleIds <- c(2, 222, 1111, 2224, 3334)
# Show sample comments
cat("\nShow sample documents, i.e., comments:\n\n")
Show sample documents, i.e., comments:
lapply(corpus[exampleIds], as.character)
$4
[1] “old delivery date to at new delivery date to at
msg please change the father to one as below with deepest sympathy
heartfelt condolences on the demise of your beloved one”
$357
[1] “sender mention that the item is not delivered
yesterday morning and recipient discharged yesterday afternoon please
arrange to resend to b rivervale drive singapore thank you”
$1646
[1] “please charge additional as there is a
delivery charge for this corporate account please email customer the
revised invoice”
$3355
[1] “change product code pls cancel this order and
will reprocess”
$5084
[1] “old delivery date to pm new delivery date to
nt msg please change the delivery timing tonight pm pm”
N <- length(exampleIds)
# Get topic proportions form example documents
topicProportionExamples <- theta[exampleIds,]
colnames(topicProportionExamples) <- topicNames
vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(exampleIds)),
variable.name = "topic", id.vars = "document")
Plot <- ggplot(data = vizDataFrame, aes(topic, value, fill = document), ylab = "proportion") +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_manual(values = c("2" = "lightgreen", "222" = "lightblue", "1111" = "pink", "2224" = "brown", "3334" = "orange"),
labels = c("Com 2", "Com 222", "Com 1111", "Com 2224", "Com 3334"),
name = "Comment") + # Custom legend title and labels
facet_wrap(~ document, ncol = 5) +
labs(title = paste("Topic Proportions in Sample Comments"),
x = NULL, y = "Topic") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
)
# Save the plot
ggsave(filename = paste("Topic Proportion Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)
An emotion analysis is performed using the NRC lexicon to attach emotions to each comment and show them over time.
# Load NRC sentiment lexicon
nrc_lexicon <- get_sentiments("nrc")
# Join tokens with NRC lexicon to get emotions
emotions <- tokens %>%
inner_join(nrc_lexicon, by = "word") %>%
count(Business, Month, Type, sentiment) %>%
spread(sentiment, n, fill = 0)
# Reshape data for plotting
emotion_long <- emotions %>%
gather(key = "emotion", value = "count", -Business, -Month, -Type) %>%
mutate(Month = ymd(Month)) %>% # Requires lubridate package
mutate(emotion = str_to_title(emotion))
# Define custom colors for each emotion
emotion_colors <- c(
"Anger" = "firebrick",
"Anticipation" = "lightgrey",
"Disgust" = "brown",
"Fear" = "tomato",
"Joy" = "lightgreen",
"Negative" = "indianred",
"Positive" = "forestgreen",
"Sadness" = "salmon",
"Surprise" = "palegreen",
"Trust" = "darkolivegreen"
)
# Plot emotions per type
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ Type, scales = "free") +
labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
x = NULL, y = "Emotion Count") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Unique Types in the data
unique_types <- unique(emotion_long$Type)
# Loop over each type and create an independent plot
for (type in unique_types) {
# Filter data for the current Type
data_subset <- emotion_long %>% filter(Type == type)
# Plot for the current Type
Plot <- ggplot(data_subset, aes(x = Month, y = count, fill = emotion)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = emotion_colors, name = "Emotion") + # Assign custom colors
labs(title = paste("Emotion Analysis for Type:", type),
x = NULL, y = "Emotion Count") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the plot
print(Plot)
# Save the plot
ggsave(filename = paste("Emotion_Analysis_Type_", type, ".png", sep = ""), plot = Plot, width = 10, height = 6)
}