1 Load Necessary Libraries and Make Some Settings
2 Get Data and Add ID
3 Prepare Data
4 Sentiment Analysis by Business, Month and Type Using Bing Lexicon
5 Show Sentiments Related to Billing
6 Pareto Chart of Comment Types
7 Top Words by Business and Type
8 Topic Modelling by Business, Month, and Type Using LDA
- 8.1 Topic Modeling by Business, Month, and Type Using LDA
9 A tibble: 4 × 2
10 Emotion Analysis by Business, Month, and Type

1 Load Necessary Libraries and Make Some Settings


# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidytext" %in% installed.packages()) { install.packages("tidytext", dependencies = TRUE) }
library(tidytext)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "textclean" %in% installed.packages()) { install.packages("textclean", dependencies = TRUE) }
library(textclean)
if(! "tm" %in% installed.packages()) { install.packages("tm", dependencies = TRUE) }
library(tm)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "lubridate" %in% installed.packages()) { install.packages("lubridate", dependencies = TRUE) }
library(lubridate)
if(! "topicmodels" %in% installed.packages()) { install.packages("topicmodels", dependencies = TRUE) }
library(topicmodels)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
if(! "kableExtra" %in% installed.packages()) { install.packages("kableExtra", dependencies = TRUE) }
library(kableExtra)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
if(! "DT" %in% installed.packages()) { install.packages("DT", dependencies = TRUE) }
library(DT)


# Global Settings
options(digits =   4)
options(scipen = 999)
setwd("C:/Users/uwe/OneDrive/Documents/AD Data/TextMining/R")

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

2 Get Data and Add ID

Purpose: To ensure all data is ready for analysis without missing or incompatible entries. Explanation: This phase involves gathering and loading the data into R for processing. The dataset, presumably containing text data like customer comments or feedback, was imported into the R environment using read.csv(). Any encoding issues were handled to ensure compatibility. Data is retrieved from URL. Original ID number is replaced by new ID number in Date sequence. Amended data file is saved.


# Get Your Text Data and Save Them in TextData

# Add ID
TextData    <- TextDataRaw[, -2]
TextData    <- TextData[order(TextData$EnqDate),]
TextData$ID <- seq.int(nrow(TextData))

datatable(head(TextData, 12),  options = list(pageLength = 12))


# Write Data to WD
write.csv(TextData, file = paste(today, "TextData.csv"))

3 Prepare Data

3.1 Clean Data and add Month Column

Purpose: To prepare the text data for analysis by removing noise and ensuring that only meaningful content is retained. Explanation: Non-ASCII characters, punctuation, and numbers were removed.Text was converted to lowercase to ensure consistency. Encoding of text data in UTF-8 ensures compatibility with all tools. All text data is turned into lower case. Date columns is turned into R date format. Month column is created.


# Function to check and remove rows where Old and New delivery dates are the same
# TextData <- TextData[!grepl("Old Delivery Date: (.+?) TO \\1 (AM|PM).*New Delivery Date : \\1 TO \\1 (AM|PM)", TextData$Complaint), ]

# Replace "Old Delivery Date" and "New Delivery Date" expressions with an empty string
TextData$Complaint <- gsub(
  "Old Delivery Date:.*?TO.*?(AM|PM|NT|[A-Z]+).*?New Delivery Date :.*?TO.*?(AM|PM|NT|[A-Z]+)",
  "",
  TextData$Complaint,
  ignore.case = TRUE
)

# Replace "msg" with an empty string
TextData$Complaint <- gsub("msg", "", TextData$Complaint, ignore.case = TRUE)

# Handle any leftover instances of "old delivery date" and "new delivery date"
#TextData$Complaint <- gsub("(?i)Old Delivery Date:.*?$|New Delivery Date :.*?$", "", TextData$Complaint, perl = TRUE)

# Trim any leading or trailing whitespace caused by the removal
TextData$Complaint <- trimws(TextData$Complaint)

# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)

# Convert the string to UTF-8 encoding
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")

# Attempt to remove any problematic characters, including control and special characters
# TextData <- TextData %>% mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = ""))  # Removes invalid characters

# Remove multibyte characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)

# Omitting any NA values
TextData <- TextData %>%
  filter(!is.na(Complaint)) %>%
  mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
  mutate(Type = str_to_title(Type)) # Convert to titlecase

# Ensure your date format is consistent
TextData$EnqDate <- as.Date(TextData$EnqDate)

# Extract month (in "Year-Month" format)
TextData <- TextData %>%
  mutate(Month = floor_date(EnqDate, "month"))

# Generate the summary
df_summary <- dfSummary(TextData)

# View the summary in the RStudio Viewer or browser
print(df_summary)

Data Frame Summary
TextData
Dimensions: 23801 x 7
Duplicates: 0

No	Variable	Stats / Values	Freqs (% of Valid)	Graph	Valid
1	Business [character]	1. Division A 2. Division B	16908 (71.0%) 6893 (29.0%)	IIIIIIIIIIIIII IIIII	23801 (100.0%)
2	EnqDate [Date]	min : 2022-07-07 med : 2023-02-03 max : 2023-07-06 range : 11m 29d	314 distinct values	: : : . : : . . . : : : : : : . : : : : : : : : : :	23801 (100.0%)
3	Type [character]	1. Cancel 2. Changes 3. Credit Card 4. Delivery Status 5. Feedback 6. Not Delivered 7. Others	1618 ( 6.8%) 9332 (39.2%) 26 ( 0.1%) 993 ( 4.2%) 3419 (14.4%) 2681 (11.3%) 5732 (24.1%)	I IIIIIII	23801 (100.0%)
4	Order [character]	1. (Empty string) 2. - 3. 0 4. 3189452 5. 3189100 6. 3217360 7. 5159916 8. 5160062 9. 5172125 10. 3217115 [ 18620 others ]	795 ( 3.3%) 138 ( 0.6%) 77 ( 0.3%) 38 ( 0.2%) 25 ( 0.1%) 25 ( 0.1%) 24 ( 0.1%) 18 ( 0.1%) 17 ( 0.1%) 15 ( 0.1%) 22629 (95.1%)		23801 (100.0%)
5	Complaint [character]	1. no information 2. pdo - not in 3. pdo - shifted 4. pdo- not in 5. pdo - shifted. 6. pdo - closed 7. pdo - no such person 8. pls update delivery statu 9. pdo - discharged 10. pdo - closed. [ 15767 others ]	5758 (24.2%) 357 ( 1.5%) 177 ( 0.7%) 124 ( 0.5%) 78 ( 0.3%) 60 ( 0.3%) 47 ( 0.2%) 41 ( 0.2%) 37 ( 0.2%) 35 ( 0.1%) 17087 (71.8%)	IIII	23801 (100.0%)
6	ID [integer]	Mean (sd) : 11901 (6871) min < med < max: 1 < 11901 < 23801 IQR (CV) : 11900 (0.6)	23801 distinct values (Integer sequence)	: : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : :	23801 (100.0%)
7	Month [Date]	min : 2022-07-01 med : 2023-02-01 max : 2023-07-01 range : 1y 0m 0d	13 distinct values	: : : . . : . . : . : : : : : : : :	23801 (100.0%)


# Show Characteristics of Data Frame
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

3.2 Preprocess Data

Punctuation, numbers, stopwords (common words like ‘the’, ‘and’) and names are removed. All comments are split into initial words, t.e., tokens. Any comments marked as “no information” or with NA values were excluded.


# Example of text cleaning
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint) 
TextData$Complaint <- removePunctuation(TextData$Complaint) 
TextData$Complaint <- removeNumbers(TextData$Complaint) 

# Remove rows with "No Information" in the Complaint column
TextData <- TextData[TextData$Complaint != "no information", ]

# Change all pdo to order
TextData$Complaint <- gsub("\\bpdo\\b", "order", TextData$Complaint)

# Load stopwords and remove them
EnglishStopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")
EnglishStopwords <- c(EnglishStopwords,'pte', 'ltd')
EnglishStopwordsPattern <- paste0("\\b(", paste(EnglishStopwords, collapse = "|"), ")\\b")

# Remove stopwords from the Complaints column
TextData <- TextData %>%
  mutate(Complaint = str_remove_all(Complaint, EnglishStopwordsPattern))

# Define custom stopwords (e.g., names) and remove them
CustomStopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
CustomStopwordsPattern <- paste0("\\b(", paste(CustomStopwords, collapse = "|"), ")\\b")

# Remove stopwords from the Complaint column
TextData <- TextData %>%
  mutate(Complaint = str_remove_all(Complaint, CustomStopwordsPattern))

date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern to match names

TextData <- TextData %>%
    filter(Complaint != "no information") %>%
    filter(Complaint != "old") %>%
    filter(!str_detect(Complaint, date_pattern)) %>%
    filter(!str_detect(Complaint, name_pattern)) %>%
    filter(!str_detect(Type, "Credit Card")) %>%                        # Remove Type Credit Card
    mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))

# Perform text analysis excluding stopwords and names
TextDatafreq <- TextData %>%
    unnest_tokens(word, Complaint) %>%
    count(word, sort = TRUE)

# Remove extra spaces
TextData <- TextData %>%
  mutate(Complaint = str_squish(Complaint))

# Write Data to WD
write.csv(TextData, file = paste(today, "TextDataClean.csv"))
write.csv(TextDatafreq, file = paste(today, "TextDatafreq.csv"))
write.csv(EnglishStopwords, file = paste(today, "EnglishStop.csv"))
write.csv(CustomStopwords, file = paste(today, "CustomStop.csv"))

3.3 Tokenise Data

Purpose: To enable word-level analysis, which is crucial for frequency analysis, sentiment analysis, and topic modeling. Explanation: Text data was split into individual words (tokens) using unnest_tokens(). This step transforms unstructured text into structured data, making it easier to analyse.


# Loading other packages if not available
if(! "textstem" %in% installed.packages()) { install.packages("textstem", dependencies = TRUE) }
library(textstem)

# Tokenise text data and remove stop words
tokens <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  anti_join(stop_words) %>%
  filter(!is.na(word))

# Lemmatise Tokens
tokensOld <- tokens
tokens$word <- lemmatize_words(tokens$word)

# Join tokens with stratified data
tokens <- tokens %>%
  inner_join(TextData %>% select(ID, Business, Month, Type), by = "ID") %>%
  select(-Business.y, -Month.y, -Type.y) %>%
  rename(
        Business = Business.x,
        Month    = Month.x,
        Type     = Type.x
    )

NumRawComments <- format(as.numeric(count(TextDataRaw)), big.mark=",", scientific = F)
NumComments <- format(as.numeric(count(TextData)), big.mark=",", scientific = F)
NumTokens <- format(as.numeric(count(tokens)), big.mark=",", scientific = F)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")

cat("\nThis dataset includes:\n")

This dataset includes:

cat("- Number of raw comments:", NumRawComments,"\n")

Number of raw comments: 23,801

cat("- Number of clean comments:", NumComments,"\n")

Number of clean comments: 18,043

cat("- Number of tokens:", NumTokens,"\n")

Number of tokens: 164,036

cat("- Comments between:", MinDate, " to ", MaxDate,"\n")

Comments between: 07 Jul 2022 to 06 Jul 2023


# Generate the summary
# df_summary <- dfSummary(tokens)

# View the summary in the RStudio Viewer or browser
# print(df_summary)

# Write Data to WD
write.csv(tokens, file = paste(today, "Tokens.csv"))

3.4 Explore Cleaned Data

Purpose: To gain initial insights into the dataset and identify patterns or anomalies. Explanation: Summary Statistics: The dataset’s structure, number of entries, and date range were explored. Visualisation: Bar charts and Pareto charts were used to highlight common trends, such as the frequency of different comment types. Data frame is explored by showing available columns, structure and first five rows of data.


# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Set Date to Date Format
TextData$EnqDate <- as.Date(TextData$EnqDate)
TextData$Month <- as.Date(TextData$Month)

# Generate the summary
# df_summary <- dfSummary(TextData)

# View the summary in the RStudio Viewer or browser
# print(df_summary)

# Show Characteristics of Tokens
knitr::kable(head(tokens), format = "html", caption = "Sample Data from Tokens") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Sample Data from Tokens
Business	EnqDate	Type	Order	ID	Month	word
Division A	2022-07-07	Changes	7101436529	3	2022-07-01	kindly
Division A	2022-07-07	Changes	7101436529	3	2022-07-01	assist
Division A	2022-07-07	Changes	7101436529	3	2022-07-01	amend
Division A	2022-07-07	Changes	7101436529	3	2022-07-01	message
Division A	2022-07-07	Changes	7101436529	3	2022-07-01	add
Division A	2022-07-07	Changes	7101436521	4	2022-07-01	change


knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Sample Data from TextData
Business	EnqDate	Type	Order	Complaint	ID	Month
Division A	2022-07-07	Changes	7101436529	kindly assist amend message add	3	2022-07-01
Division A	2022-07-07	Changes	7101436521	change father deepest sympathy heartfelt condolences demise beloved	4	2022-07-01
Division A	2022-07-07	Delivery Status	7101436479	kindly check delivery status today pm delivery called back ms feonne soo feonnesoo gmail	5	2022-07-01
Division A	2022-07-07	Delivery Status	7101436235	recipient husband called claimed told wife suppose receive item till receive mentioned receive missed call assist check revert recipient husband mr sakthi system st attempt failed subsequent delivery status custs receive	6	2022-07-01
Division B	2022-07-07	Cancel	4101073613	cancel order due stock item line	7	2022-07-01
Division B	2022-07-07	Feedback	4101073586	refund eca due stock sap key amount xba delivery fee	8	2022-07-01

4 Sentiment Analysis by Business, Month and Type Using Bing Lexicon

Purpose: To assess the emotional tone of the text data, identifying areas with strong positive or negative sentiments. Explanation: Sentiment lexicons like Bing or NRC were used to classify words as positive, negative, or associated with specific emotions. Sentiment scores were calculated and visualized over time and by business type. The result of positive and negative sentiments by business and month is printed in bar charts.


# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")

# Join tokens with Bing sentiments and calculate scores
sentiment_scores <- tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Business, Month, Type) %>%
  summarise(sentiment_score = sum(score)) %>%
  ungroup()

# Example of improved sentiment visualization
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "dodge") +  # Stacked bars for each Business within each Month
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 1", ".png", sep = ""), plot = Plot, width = 8, height = 5)

# Improved sentiment visualization with stacked bars by sentiment type 
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "stack") +  # Stack positive and negative sentiments in each bar
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 2", ".png", sep = ""), plot = Plot, width = 8, height = 5)

5 Show Sentiments Related to Billing

All sentiments related to words like “billing”, “charge”, “invoice” and “payment” are filtered out of the comments and plot by business over time.


# Filter complaints that mention billing-related keywords
billing_complaints <- TextData %>%
  filter(grepl("billing|charge|invoice|payment", Complaint, ignore.case = TRUE))

# Tokenize and calculate sentiment score for billing complaints
billing_tokens <- billing_complaints %>%
  unnest_tokens(word, Complaint) %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(sentiment_score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Business, Month, Type) %>%
  summarise(total_sentiment = sum(sentiment_score)) %>%
  ungroup()

# Improved sentiment visualization with stacked bars by sentiment type 
Plot <- ggplot(billing_tokens, aes(x = Month, y = total_sentiment, fill = total_sentiment > 0)) +
  geom_col(position = "stack") +  # Stack positive and negative sentiments in each bar
  facet_wrap(~ Business, ncol = 1) +  # Facet by Business
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
                    labels = c("Positive", "Negative"),
                    limits = c("TRUE", "FALSE"),  # Set order of legend items
                    name = "Sentiment") +  # Custom legend title and labels
  labs(title = "Billing Related Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) +
  scale_x_date(date_labels = "%b %Y")

# Print the Plot
Plot


str(billing_tokens)

tibble [90 × 4] (S3: tbl_df/tbl/data.frame) $ Business : chr [1:90] “Division A” “Division A” “Division A” “Division A” … $ Month : Date[1:90], format: “2022-07-01” “2022-07-01” … $ Type : chr [1:90] “Cancel” “Changes” “Feedback” “Not Delivered” … $ total_sentiment: num [1:90] -1 1 21 -1 2 18 1 4 22 1 …


# Save the plot
ggsave(filename = paste("Billing Related Sentiments by Business and Month", ".png", sep = ""), plot = Plot, width = 8, height = 5)

6 Pareto Chart of Comment Types

The number of comments is stratified by comment types and shown in a Pareto chart.


# Calculate the frequency and cumulative percentage for each Type
type_frequency <- TextData %>%
  count(Type, sort = TRUE) %>%
  mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)

# Plot the Pareto chart
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +                       # Bars for frequency
  geom_text(aes(label = n), vjust = -0.5, size = 4) +
  geom_line(aes(y = cumulative_percentage * max(type_frequency$n) / 100, group = 1),  # Line for cumulative %
    color = "red", size = 1) +
  geom_point(aes(y = cumulative_percentage * max(type_frequency$n) / 100), color = "red") +  # Points on the line
  geom_text(aes(
    y = cumulative_percentage * max(type_frequency$n) / 100, 
    label = sprintf("%.1f%%", cumulative_percentage)
    ), color = "red", vjust = -0.5, size = 4) +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / max(type_frequency$n) * 100, name = "Cumulative Percentage")  # Secondary y-axis for %
  ) +
  labs(title = "Pareto Chart of Comment Types",
       x = NULL,
       y = "Frequency") +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Pareto Chart of Comment Types", ".png", sep = ""), plot = Plot, width = 8, height = 5)

7 Top Words by Business and Type

Purpose: To understand the key terms or themes in the dataset and their relevance across different categories. Explanation: The most frequently occurring words were identified and visualised. The most common words in comments are extracted by business and comment type and displayed in bar charts.


# Count word frequencies by Business, Month, and Type
word_freq <- tokens %>%
  group_by(Business, Type, word) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))

# Visualize top words by stratified groups
top_words <- word_freq %>%
  group_by(Business, Type) %>%
  ungroup()

# Filter for top 10 words across all types
top_10_words <- top_words %>%
  group_by(Type) %>%
  slice_max(order_by = freq, n = 10) %>%  # Select the top 10 words by frequency within each Type
  ungroup()

# Make the Graph
Plot <- ggplot(top_10_words, aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Type, scales = "free", ncol = 3) +
  coord_flip() +
  labs(title = "Top Words by Business and Type",
       x = "Words", y = "Frequency") +
  scale_x_reordered() +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
  theme(
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Word Frequency Analysis", ".jpg", sep = ""), plot = Plot, width = 12, height = 7)

cat("\nShow Top 10 Words Table\n")

Show Top 10 Words Table

knitr::kable(head(top_10_words), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Sample Data from TextData
Business	Type	word	freq
Division A	Cancel	cancel	829
Division B	Cancel	cancel	559
Division A	Cancel	customer	384
Division A	Cancel	reprocess	359
Division A	Cancel	line	305
Division A	Cancel	item	240


cat("\nShow Top 10 Terms Table\n")

Show Top 10 Terms Table

knitr::kable(head(terms), format = "html", caption = "Sample Data from TextData") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Sample Data from TextData

new(“standardGeneric”, .Data = function (x, …)
standardGeneric(“terms”), generic = structure(“terms”, package = “stats”),
package = “stats”, group = list(), valueClass = character(0),
signature = “x”, default = new(“derivedDefaultMethod”, .Data = function (x,
…)
UseMethod(“terms”), target = new(“signature”, .Data = “ANY”,

8 Topic Modelling by Business, Month, and Type Using LDA

Purpose: To uncover the underlying themes or categories within the dataset, providing a high-level view of the content. Explanation: Latent Dirichlet Allocation (LDA) was applied to discover hidden topics within the text. The optimal number of topics was determined using metrics like Griffiths2004 and CaoJuan2009. The most frequent terms within each topic were extracted to assign meaningful labels to topics. ## Identifying the Best Number of Topics for LDA following Arun (2010), CaoJuan (2009), Deveaud (2014), Griffiths (2004) Latent Dirichlet Allocation (LDA) is used to discover hidden topics. Firstly, the number of topics needs to be decided based on indicators by (2010), CaoJuan (2009), Deveaud (2014) and Griffiths (2004). Arjun and CaoJuan should be minimised, whereas indicators by Deveaud and Griffiths should be maximised.


# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)

# Create Document-Term Matrix by stratified groups
dtm <- tokens %>%
  count(ID, word) %>%
  cast_dtm(ID, word, n)

# Identify number of topics using all four indicators (Griffiths2004, CaoJuan2009, Arun2010, Deveaud2014)
LDA_result4 <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result4)


# Identify number of topics using only two indicators (CaoJuan2009, Deveaud2014)
LDA_result2 <- FindTopicsNumber(
  dtm,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("CaoJuan2009", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

fit models… done. calculate metrics: CaoJuan2009… done. Deveaud2014… done.


FindTopicsNumber_plot(LDA_result2)

8.1 Topic Modeling by Business, Month, and Type Using LDA

Secondly, Latent Dirichlet Allocation (LDA) is conducted with the number of topics derived in the previous step. For practicality reasons, a topic number of six is chosen for further analysis.


# Define Number of Topics
NumTopics = 4

# Perform LDA with a specified number of topics (e.g., 4 topics)
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))

# Extract top terms per topic
topics <- tidy(lda_model, matrix = "beta")

# Extract top terms per topic and arrange them for display
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%   # Select the top 10 terms with the highest beta values for each topic
  arrange(topic, desc(beta))  # Arrange terms by topic and descending beta values

# Print the terms nicely
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print(n = Inf)  # Print all rows (no truncation)

9 A tibble: 4 × 2

topic terms

1 1 change, delivery, address, date, pm, time, singapore, blk, pl, assist
2 2 customer, cancel, refund, kindly, invoice, charge, item, assist, amount… 3 3 delivery, call, recipient, check, customer, deliver, sender, status, ha… 4 4 message, line, card, change, remove, street, add, family, unit, company


# Write Data to WD
write.csv(top_terms, file = paste(today, "TopTerms.csv"))

# Visualize the top terms in each topic
Plot <- ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 2) +
  coord_flip() +
  labs(title = "Top Terms per Topic",
       x = "Term",
       y = "Frequency") +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) + 
  scale_x_reordered()

# Print the Plot
Plot


# Save the plot
ggsave(filename = paste("Top Terms in each Topic", ".png", sep = ""), plot = Plot, width = 15, height = 10)

9.1 Topic Modelling Results

Extract the most frequent terms per topic and business. This should help to assign themes to topics.


# Have a look at some of the results (posterior distributions)
tmResult <- posterior(lda_model)

# Format of the resulting object
attributes(tmResult)

$names [1] “terms” “topics”

nTerms(dtm)              # lengthOfVocab

[1] 11177


beta <- tmResult$terms   # get beta from results
dim(beta)

[1] 4 11177


rowSums(beta)            # rows in beta sum to 1

1 2 3 4 1 1 1 1


# Show 10 Most Frequent Terms per Topic
terms <- terms(lda_model, 20)

# Give Topics Names
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
topicNames

                             Topic 1 
   "change delivery address date pm" 
                             Topic 2

“customer cancel refund kindly invoice” Topic 3 “delivery call recipient check customer” Topic 4 “message line card change remove”


# For every document we have a probability distribution of its contained topics
theta <- tmResult$topics 
dim(theta)

[1] 17348 4


# Write Data to WD
write.csv(terms, file = paste(today, "Terms.csv"))

9.2 Visualise Modelling Results

Purpose: To make the results accessible and actionable for stakeholders. Explanation: Various visualisations were created, such as bar charts, word clouds, and topic distributions, to present findings in an easily interpretable format. Each visualisation was tailored to highlight specific insights, such as topic importance or sentiment trends.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)

# Saving each plot as an image
for(topicToViz in 1:NumTopics)  {
  
  cat("\nWordcloud for topic ", topicToViz, "\n\n")
  
  # Visualize topics as word cloud
  topicToViz <- topicToViz # change for your own topic of interest
  
  # Select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
  top40terms <- sort(tmResult$terms[topicToViz,], decreasing = TRUE)[1:40]
  words <- names(top40terms)
  
  # Extract the probabilites of each of the 40 terms
  probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
  
  # Visualize the terms as wordcloud
  mycolors <- brewer.pal(8, "Dark2")
  wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

  # Save the plot
  # ggsave(filename = paste("Word Cloud ", wordcloud, ".png", sep = ""), plot = p, width = 8, height = 5)
  
}

Wordcloud for topic 1

Wordcloud for topic 2

Wordcloud for topic 3

Wordcloud for topic 4

9.3 Visualise Topic Distribution within Documents

Identify the distribution of topics in documents, i.e., comments.


# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
if(! "reshape2" %in% installed.packages()) { install.packages("reshape2", dependencies = TRUE) }
library(reshape2)

# Make Examples
exampleIds <- c(55, 62, 1111, 2224)

# Show sample comments
cat("\nShow sample documents, i.e., comments:\n\n")

Show sample documents, i.e., comments:


# Filter rows and select the Complaint column
SelectedComplaints <- TextData %>%
  filter(ID %in% exampleIds) %>%
  select(ID, Complaint)

print(SelectedComplaints)

ID

1 55 2 62 3 1111 4 2224 Complaint 1 assist change delivery address blk woodlands drive singapore recipient hp ms siew eng recipient discharged hospital 2 status assist check delivered yesterday night update 3 change date friday thursday 4 assist give customer call apologise proceed refund


N <- length(exampleIds)


# Get topic proportions form example documents
topicProportionExamples <- theta[exampleIds,]

colnames(topicProportionExamples) <- topicNames

vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(exampleIds)), 
                     variable.name = "topic", id.vars = "document")  

Plot <- ggplot(data = vizDataFrame, aes(topic, value, fill = document), ylab = "proportion") + 
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("55" = "lightgreen", "62" = "lightblue", "1111" = "pink", "2224" = "brown", "3334" = "orange"),
                    labels = c("Com 55", "Com 62", "Com 1111", "Com 2224", "Com 3334"),
                    name = "Comment") +  # Custom legend title and labels
  facet_wrap(~ document, ncol = 4) +
    labs(title = paste("Topic Proportions in Sample Comments"),
         x = NULL, y = "Topic") +
    theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")  # Custom facet label styling
  ) 

Plot


# Save the plot
ggsave(filename = paste("Topic Proportion Analysis", ".jpg", sep = ""), plot = Plot, width = 15, height = 9)

10 Emotion Analysis by Business, Month, and Type

The NRC lexicon assigns words to specific emotions such as anger, joy, sadness, and more, along with general positive or negative sentiment. This step ensures that the emotions are defined for words in the tokenized text data. Each word in the dataset is matched with its corresponding emotion. Words are aggregated by Business, Month, and Type. This allows the data to be reshaped, showing how often each emotion appears within different categories. Reshaping the data into a long format is essential for visualization with ggplot2. The Month is also formatted into a standard date type for proper chronological ordering in plots. Using str_to_title() ensures consistent capitalization of emotion labels. Assigning distinct colors for each emotion enhances the readability of the visualisations. Grouping similar emotions (e.g., red shades for negative emotions, green shades for positive ones) could improve clarity.


# Load NRC sentiment lexicon
nrc_lexicon <- get_sentiments("nrc")

# Join tokens with NRC lexicon to get emotions
emotions <- tokens %>%
  inner_join(nrc_lexicon, by = "word") %>%
  count(Business, Month, Type, sentiment) %>%
  spread(sentiment, n, fill = 0)

# Reshape data for plotting
emotion_long <- emotions %>%
  gather(key = "emotion", value = "count", -Business, -Month, -Type) %>%
  mutate(Month = ymd(Month))  %>%   # Requires lubridate package
  mutate(emotion = str_to_title(emotion))

# Define custom colors for each emotion
emotion_colors <- c(
  "Anger" = "firebrick",
  "Anticipation" = "lightgrey",
  "Disgust" = "brown",
  "Fear" = "tomato",
  "Joy" = "lightgreen",
  "Negative" = "indianred",
  "Positive" = "forestgreen",
  "Sadness" = "salmon",
  "Surprise" = "palegreen",
  "Trust" = "darkolivegreen"
)

# Plot emotions per type
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ Type, scales = "free") +
  labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
       x = NULL, y = "Emotion Count") +
  theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 


# Unique Types in the data
unique_types <- unique(emotion_long$Type)

# Loop over each type and create an independent plot
for (type in unique_types) {
  
  # Filter data for the current Type
  data_subset <- emotion_long %>% filter(Type == type)
  
  # Plot for the current Type
  Plot <- ggplot(data_subset, aes(x = Month, y = count, fill = emotion)) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = emotion_colors, name = "Emotion") +  # Assign custom colors
    labs(title = paste("Emotion Analysis for Type:", type),
         x = NULL, y = "Emotion Count") +
    theme_minimal() +
   theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  ) 

  # Print the plot
  print(Plot) 

  # Save the plot
  ggsave(filename = paste("Emotion_Analysis_Type_", type, ".png", sep = ""), plot = Plot, width = 10, height = 6)
  
}

Text Mining Analysing Text Data

Updated by Uwe H Kaufmann on 28 Jan 2025, 10:09