# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidytext" %in% installed.packages()) { install.packages("tidytext", dependencies = TRUE) }
library(tidytext)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "textclean" %in% installed.packages()) { install.packages("textclean", dependencies = TRUE) }
library(textclean)
if(! "tm" %in% installed.packages()) { install.packages("tm", dependencies = TRUE) }
library(tm)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "lubridate" %in% installed.packages()) { install.packages("lubridate", dependencies = TRUE) }
library(lubridate)
if(! "topicmodels" %in% installed.packages()) { install.packages("topicmodels", dependencies = TRUE) }
library(topicmodels)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
if(! "kableExtra" %in% installed.packages()) { install.packages("kableExtra", dependencies = TRUE) }
library(kableExtra)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Global Settings
options(digits = 4)
options(scipen = 999)
setwd("C:/Users/uwe/OneDrive/Documents/AD Data/TextMining/R")
# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
Purpose: To ensure all data are ready for analysis, with no missing or incompatible entries. Explanation: This phase gathers the data and loads them into R for processing. The dataset, containing free-text customer comments, is imported with read.csv() and any encoding issues are handled to ensure compatibility. The data are retrieved from a URL, the original ID number is replaced by a new ID assigned in date order, and the amended data file is saved.
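The import step itself is not shown in the chunk below; a minimal sketch, assuming the raw file is fetched from a URL (the URL below is a placeholder, not the actual source):
# Hypothetical import step (the URL is a placeholder, not the actual source): read the raw comments into TextDataRaw
TextDataRaw <- read.csv("https://example.com/TextDataRaw.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-8")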
# Get Your Text Data and Save Them in TextData
# Add ID
TextData <- TextDataRaw[, -2]
TextData <- TextData[order(TextData$EnqDate),]
TextData$ID <- seq.int(nrow(TextData))
# Write Data to WD
write.csv(TextData, file = paste(today, "TextData.csv"))
Purpose: To prepare the text data for analysis by removing noise and retaining only meaningful content. Explanation: Non-ASCII characters, punctuation, and numbers were removed, and the text was converted to lowercase for consistency. Re-encoding the text as UTF-8 ensures compatibility with all tools. The date column is converted to R's Date format and a Month column is created.
# Loading other packages if not available
if(! "glue" %in% installed.packages()) { install.packages("glue", dependencies = TRUE) }
library(glue)
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)
# Convert the string to UTF-8 encoding
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")
# Remove multibyte characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)
# Omitting any NA values
TextData <- TextData %>%
filter(!is.na(Complaint)) %>%
mutate(Complaint = tolower(Complaint)) %>% # Convert to lowercase
mutate(Type = str_to_title(Type)) # Convert to titlecase
# Ensure your date format is consistent
TextData$EnqDate <- as.Date(TextData$EnqDate)
# Extract month (in "Year-Month" format)
TextData <- TextData %>%
mutate(Month = floor_date(EnqDate, "month"))
# Generate the summary
df_summary <- dfSummary(TextData)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
TextData
Dimensions: 23801 x 7
Duplicates: 0
No | Variable | Stats / Values | Freqs (% of Valid) | Valid | Missing
---|---|---|---|---|---
1 | Business [character] | 1. Division A, 2. Division B | 16908 (71.0%), 6893 (29.0%) | 23801 (100.0%) | 0 (0.0%)
2 | EnqDate [Date] | min: 2022-07-07, med: 2023-02-03, max: 2023-07-06, range: 11m 29d | 314 distinct values | 23801 (100.0%) | 0 (0.0%)
3 | Type [character] | 1. Cancel, 2. Changes, 3. Credit Card, 4. Delivery Status, 5. Feedback, 6. Not Delivered, 7. Others | 1618 (6.8%), 9332 (39.2%), 26 (0.1%), 993 (4.2%), 3419 (14.4%), 2681 (11.3%), 5732 (24.1%) | 23801 (100.0%) | 0 (0.0%)
4 | Order [character] | 1. (Empty string), 2. -, 3. 0, 4. 3189452, 5. 3189100, 6. 3217360, 7. 5159916, 8. 5160062, 9. 5172125, 10. 3217115, [18620 others] | 795 (3.3%), 138 (0.6%), 77 (0.3%), 38 (0.2%), 25 (0.1%), 25 (0.1%), 24 (0.1%), 18 (0.1%), 17 (0.1%), 15 (0.1%), 22629 (95.1%) | 23801 (100.0%) | 0 (0.0%)
5 | Complaint [character] | 1. no information, 2. pdo - not in, 3. pdo - shifted, 4. pdo- not in, 5. pdo - shifted., 6. pdo - closed, 7. pdo - no such person, 8. pls update delivery statu, 9. pdo - discharged, 10. pdo - closed., [16116 others] | 5758 (24.2%), 357 (1.5%), 177 (0.7%), 124 (0.5%), 78 (0.3%), 60 (0.3%), 47 (0.2%), 41 (0.2%), 37 (0.2%), 35 (0.1%), 17087 (71.8%) | 23801 (100.0%) | 0 (0.0%)
6 | ID [integer] | Mean (sd): 11901 (6871), min < med < max: 1 < 11901 < 23801, IQR (CV): 11900 (0.6) | 23801 distinct values (integer sequence) | 23801 (100.0%) | 0 (0.0%)
7 | Month [Date] | min: 2022-07-01, med: 2023-02-01, max: 2023-07-01, range: 1y 0m 0d | 13 distinct values | 23801 (100.0%) | 0 (0.0%)
# Show Characteristics of Data Frame
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Punctuation, numbers, stopwords (common words like ‘the’ and ‘and’), and names are removed. All comments are split into individual words, i.e., tokens. Any comments marked as “no information” or containing NA values are excluded.
# Example of text cleaning
TextData$Complaint <- str_replace_all(TextData$Complaint, '[[:punct:]]', ' ') # remove punctuation
TextData$Complaint <- str_to_lower(TextData$Complaint) # convert to lowercase
TextData$Complaint <- replace_non_ascii(TextData$Complaint)
TextData$Complaint <- removePunctuation(TextData$Complaint)
TextData$Complaint <- removeNumbers(TextData$Complaint)
TextData <- TextData %>%
filter(Complaint != "no information") %>%
mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))
# Define custom stopwords (e.g., names)
custom_stopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern for capitalised names; note the tokens are already lowercased, so this filter has little effect
# Perform text analysis excluding stopwords and names
TextDatafreq <- TextData %>%
unnest_tokens(word, Complaint) %>%
filter(!word %in% custom_stopwords) %>%
filter(!str_detect(word, date_pattern)) %>%
filter(!str_detect(word, name_pattern)) %>%
count(word, sort = TRUE)
Purpose: To enable word-level analysis, which is crucial for frequency analysis, sentiment analysis, and topic modeling. Explanation: Text data was split into individual words (tokens) using unnest_tokens(). This step transforms unstructured text into structured data, making it easier to analyse.
# Loading other packages if not available
if(! "textstem" %in% installed.packages()) { install.packages("textstem", dependencies = TRUE) }
library(textstem)
# Tokenize text data and remove stop words
tokens <- TextData %>%
unnest_tokens(word, Complaint) %>%
anti_join(stop_words) %>%
filter(!is.na(word))
# Lemmatise Tokens
tokensOld <- tokens
tokens$word <- lemmatize_words(tokens$word)
# Join tokens with stratified data
tokens <- tokens %>%
inner_join(TextData %>% select(ID, Business, Month, Type), by = "ID") %>%
select(-Business.y, -Month.y, -Type.y) %>%
rename(
Business = Business.x,
Month = Month.x,
Type = Type.x
)
NumRawComments <- format(as.numeric(count(TextDataRaw)), big.mark=",", scientific = F)
NumComments <- format(as.numeric(count(TextData)), big.mark=",", scientific = F)
NumTokens <- format(as.numeric(count(tokens)), big.mark=",", scientific = F)
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")
cat("\nThis dataset includes:\n")
This dataset includes:
cat("- Number of raw comments:", NumRawComments,"\n")
cat("- Number of clean comments:", NumComments,"\n")
cat("- Number of tokens:", NumTokens,"\n")
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
# Generate the summary
df_summary <- dfSummary(tokens)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
tokens
Dimensions: 225957 x 7
Duplicates: 47293
No | Variable | Stats / Values | Freqs (% of Valid) | Valid | Missing
---|---|---|---|---|---
1 | Business [character] | 1. Division A, 2. Division B | 160584 (71.1%), 65373 (28.9%) | 225957 (100.0%) | 0 (0.0%)
2 | EnqDate [Date] | min: 2022-07-07, med: 2023-02-03, max: 2023-07-06, range: 11m 29d | 312 distinct values | 225957 (100.0%) | 0 (0.0%)
3 | Type [character] | 1. Cancel, 2. Changes, 3. Delivery Status, 4. Feedback, 5. Not Delivered | 11675 (5.2%), 142776 (63.2%), 9029 (4.0%), 50963 (22.6%), 11514 (5.1%) | 225957 (100.0%) | 0 (0.0%)
4 | Order [character] | 1. 0, 2. 3189452, 3. (Empty string), 4. 4203001839, 5. 7101450475, 6. 7101462612, 7. 7101452379, 8. 3221733, 9. 4101079583, 10. 3217266, [15067 others] | 515 (0.2%), 220 (0.1%), 173 (0.1%), 159 (0.1%), 153 (0.1%), 153 (0.1%), 135 (0.1%), 134 (0.1%), 133 (0.1%), 129 (0.1%), 224053 (99.2%) | 225957 (100.0%) | 0 (0.0%)
5 | ID [integer] | Mean (sd): 11921 (6835), min < med < max: 3 < 11909 < 23787, IQR (CV): 11789 (0.6) | 17990 distinct values | 225957 (100.0%) | 0 (0.0%)
6 | Month [Date] | min: 2022-07-01, med: 2023-02-01, max: 2023-07-01, range: 1y 0m 0d | 13 distinct values | 225957 (100.0%) | 0 (0.0%)
7 | word [character] | 1. delivery, 2. date, 3. msg, 4. pm, 5. change, 6. customer, 7. not, 8. message, 9. line, 10. card, [11179 others] | 24079 (10.7%), 20477 (9.1%), 9958 (4.4%), 7649 (3.4%), 6914 (3.1%), 3956 (1.8%), 3771 (1.7%), 3307 (1.5%), 2810 (1.2%), 2769 (1.2%), 140267 (62.1%) | 225957 (100.0%) | 0 (0.0%)
Purpose: To gain initial insights into the dataset and identify patterns or anomalies. Explanation: Summary statistics: the dataset’s structure, number of entries, and date range were explored. Visualisation: bar charts and Pareto charts were used to highlight common trends, such as the frequency of different comment types. The data frame is explored by listing its columns, structure, and first five rows.
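The column, structure, and head inspection mentioned above is not included in the chunks below (the kable preview is commented out); a minimal base-R sketch:
# Sketch (not part of the original run): list columns, show structure, and preview the first five rows
names(TextData)
str(TextData)
head(TextData, 5)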
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Attempt to remove any problematic characters, including control and special characters
TextData <- TextData %>%
mutate_all(~ iconv(., from = "UTF-8", to = "UTF-8", sub = "")) # Removes invalid characters; iconv() returns character, so every column (including ID) becomes character and the date columns are re-converted below
TextData$EnqDate <- as.Date(TextData$EnqDate)
TextData$Month <- as.Date(TextData$Month)
# Generate the summary
df_summary <- dfSummary(TextData)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
TextData
Dimensions: 18043 x 7
Duplicates: 0
No | Variable | Stats / Values | Freqs (% of Valid) | Valid | Missing
---|---|---|---|---|---
1 | Business [character] | 1. Division A, 2. Division B | 12671 (70.2%), 5372 (29.8%) | 18043 (100.0%) | 0 (0.0%)
2 | EnqDate [Date] | min: 2022-07-07, med: 2023-02-04, max: 2023-07-06, range: 11m 29d | 312 distinct values | 18043 (100.0%) | 0 (0.0%)
3 | Type [character] | 1. Cancel, 2. Changes, 3. Delivery Status, 4. Feedback, 5. Not Delivered | 1618 (9.0%), 9332 (51.7%), 993 (5.5%), 3419 (18.9%), 2681 (14.9%) | 18043 (100.0%) | 0 (0.0%)
4 | Order [character] | 1. 3189452, 2. 0, 3. 3189100, 4. 3217360, 5. 5159916, 6. 5160062, 7. (Empty string), 8. 5172125, 9. 3212425, 10. 3221869, [15086 others] | 38 (0.2%), 32 (0.2%), 25 (0.1%), 25 (0.1%), 23 (0.1%), 18 (0.1%), 17 (0.1%), 17 (0.1%), 14 (0.1%), 14 (0.1%), 17820 (98.8%) | 18043 (100.0%) | 0 (0.0%)
5 | Complaint [character] | 1. pdo not in, 2. pdo shifted, 3. pdo closed, 4. pdo no such person, 5. pdo cannot locate, 6. amount should be sap capt, 7. pdo no such address, 8. pdo discharged, 9. not in, 10. pdo not in x, [15293 others] | 539 (3.0%), 329 (1.8%), 123 (0.7%), 73 (0.4%), 72 (0.4%), 64 (0.4%), 57 (0.3%), 49 (0.3%), 48 (0.3%), 44 (0.2%), 16645 (92.3%) | 18043 (100.0%) | 0 (0.0%)
6 | ID [character] | 1. 10, 2. 1000, 3. 10000, 4. 10001, 5. 10002, 6. 10003, 7. 10004, 8. 10005, 9. 10006, 10. 10007, [18033 others] | 1 (0.0%) each, 18033 (99.9%) | 18043 (100.0%) | 0 (0.0%)
7 | Month [Date] | min: 2022-07-01, med: 2023-02-01, max: 2023-07-01, range: 1y 0m 0d | 13 distinct values | 18043 (100.0%) | 0 (0.0%)
# Show Characteristics of Tokens
knitr::kable(head(tokens), format = "html", caption = "Sample Data from Tokens") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | ID | Month | word |
---|---|---|---|---|---|---|
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | delivery |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | date |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | delivery |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | date |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | msg |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | kindly |
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Purpose: To assess the emotional tone of the text data and identify areas with strongly positive or negative sentiment. Explanation: Sentiment lexicons such as Bing and NRC were used to classify words as positive, negative, or associated with specific emotions. Sentiment scores were calculated and visualised over time and by business type. The resulting positive and negative sentiment scores by business and month are shown in bar charts.
# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")
# Join tokens with Bing sentiments and calculate scores
sentiment_scores <- tokens %>%
inner_join(bing_lexicon, by = "word") %>%
mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
group_by(Business, Month, Type) %>%
summarise(sentiment_score = sum(score)) %>%
ungroup()
# Example of improved sentiment visualization
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
geom_col(position = "dodge") + # Dodged bars for positive/negative scores within each Month
facet_wrap(~ Business, ncol = 1) + # Facet by Business
scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
labels = c("Positive", "Negative"),
limits = c("TRUE", "FALSE"), # Set order of legend items
name = "Sentiment") + # Custom legend title and labels
labs(title = "Sentiment Analysis by Business and Month",
x = NULL, y = "Sentiment Score") +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 1", ".png", sep = ""), plot = Plot, width = 8, height = 5)
# Improved sentiment visualization with stacked bars by sentiment type
Plot <- ggplot(sentiment_scores, aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
geom_col(position = "stack") + # Stack positive and negative sentiments in each bar
facet_wrap(~ Business, ncol = 1) + # Facet by Business
scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
labels = c("Positive", "Negative"),
limits = c("TRUE", "FALSE"), # Set order of legend items
name = "Sentiment") + # Custom legend title and labels
labs(title = "Sentiment Analysis by Business and Month",
x = NULL, y = "Sentiment Score") +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Sentiment Analysis by Business and Month 2", ".png", sep = ""), plot = Plot, width = 8, height = 5)
The number of comments is stratified by comment types and shown in a Pareto chart.
# Calculate the frequency and cumulative percentage for each Type
type_frequency <- TextData %>%
count(Type, sort = TRUE) %>%
mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)
# Plot the Pareto chart
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") + # Bars for frequency
geom_line(aes(y = cumulative_percentage * max(type_frequency$n) / 100, group = 1), # Line for cumulative %
color = "red", size = 1) +
geom_point(aes(y = cumulative_percentage * max(type_frequency$n) / 100), color = "red") + # Points on the line
scale_y_continuous(
sec.axis = sec_axis(~ . / max(type_frequency$n) * 100, name = "Cumulative Percentage") # Secondary y-axis for %
) +
labs(title = "Pareto Chart of Comment Types",
x = NULL,
y = "Frequency") +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Pareto Chart of Comment Types", ".png", sep = ""), plot = Plot, width = 8, height = 5)
Purpose: To understand the key terms or themes in the dataset and their relevance across different categories. Explanation: The most frequently occurring words were identified and visualised. The most common words in comments are extracted by business and comment type and displayed in bar charts.
# Count word frequencies by Business, Month, and Type
word_freq <- tokens %>%
group_by(Business, Type, word) %>%
summarise(freq = n()) %>%
arrange(desc(freq))
# Visualize top words by stratified groups
top_words <- word_freq %>%
group_by(Business, Type) %>%
ungroup()
# Filter for the top 10 words within each Type
top_10_words <- top_words %>%
group_by(Type) %>%
slice_max(order_by = freq, n = 10) %>% # Select the top 10 words by frequency within each Type
ungroup()
# Make the Graph
Plot <- ggplot(top_10_words, aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
geom_col(show.legend = TRUE) +
facet_wrap(~ Type, scales = "free", ncol = 2) +
coord_flip() +
labs(title = "Top Words by Business and Type",
x = "Words", y = "Frequency") +
scale_x_reordered() +
theme_minimal() +
theme(axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)) +
theme(
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
) +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Word Frequency Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)
cat("\nShow Top 10 Words Table\n")
Show Top 10 Words Table
knitr::kable(head(top_10_words), format = "html", caption = "Top 10 Words by Business and Type") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | Type | word | freq |
---|---|---|---|
Division A | Cancel | cancel | 829 |
Division B | Cancel | cancel | 559 |
Division A | Cancel | customer | 384 |
Division A | Cancel | reprocess | 359 |
Division A | Cancel | line | 305 |
Division A | Cancel | item | 240 |
cat("\nShow Top 10 Terms Table\n")
Show Top 10 Terms Table
# 'terms' is not defined at this point (it resolves to the stats::terms generic); show the overall word-frequency table instead
knitr::kable(head(TextDatafreq, 10), format = "html", caption = "Top 10 Terms Overall") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Purpose: To uncover the underlying themes or categories within the dataset, providing a high-level view of the content. Explanation: Latent Dirichlet Allocation (LDA) was applied to discover hidden topics within the text. The optimal number of topics was determined using metrics such as Griffiths2004 and CaoJuan2009. The most frequent terms within each topic were extracted to assign meaningful labels to the topics.
## Identifying the Best Number of Topics for LDA following Arun (2010), CaoJuan (2009), Deveaud (2014), and Griffiths (2004)
Latent Dirichlet Allocation (LDA) is used to discover hidden topics. Firstly, the number of topics is chosen based on the indicators of Arun (2010), CaoJuan (2009), Deveaud (2014), and Griffiths (2004): the Arun and CaoJuan indicators should be minimised, whereas the Deveaud and Griffiths indicators should be maximised.
# Loading other packages if not available
if(! "ldatuning" %in% installed.packages()) { install.packages("ldatuning", dependencies = TRUE) }
library(ldatuning)
# Create Document-Term Matrix by stratified groups
dtm <- tokens %>%
count(ID, word) %>%
cast_dtm(ID, word, n)
# Identify number of topics
LDA_result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 15, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
fit models… done. calculate metrics: Griffiths2004… done. CaoJuan2009… done. Arun2010… done. Deveaud2014… done.
FindTopicsNumber_plot(LDA_result)
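As a supplementary check, the raw metric values can be ranked directly rather than read off the plot; a sketch, assuming the data frame returned by FindTopicsNumber keeps a "topics" column plus one column per metric:
# Sketch: rank candidate topic numbers by the CaoJuan2009 metric (lower is better)
head(LDA_result[order(LDA_result$CaoJuan2009), c("topics", "CaoJuan2009", "Arun2010", "Griffiths2004", "Deveaud2014")])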
Secondly, Latent Dirichlet Allocation (LDA) is conducted with the number of topics derived in the previous step. For practicality reasons, a topic number of six is chosen for further analysis.
# Define Number of Topics
NumTopics = 6
# Perform LDA with the specified number of topics (NumTopics = 6)
lda_model <- LDA(dtm, k = NumTopics, control = list(seed = 1234))
# Extract top terms per topic
topics <- tidy(lda_model, matrix = "beta")
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup()
# Visualize the top terms in each topic
Plot <- ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free", ncol = 3) +
coord_flip() +
labs(title = "Top Terms per Topic",
x = "Term",
y = "Frequency") +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
) +
scale_x_reordered()
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste("Top Terms in each Topic", ".png", sep = ""), plot = Plot, width = 15, height = 10)
Extract the most frequent terms per topic and business. This should help to assign themes to topics.
# Have a look at some of the results (posterior distributions)
tmResult <- posterior(lda_model)
# Format of the resulting object
attributes(tmResult)
$names
[1] "terms"  "topics"
nTerms(dtm) # lengthOfVocab
[1] 11189
beta <- tmResult$terms # get beta from results
dim(beta)
[1] 6 11189
rowSums(beta) # rows in beta sum to 1
1 2 3 4 5 6
1 1 1 1 1 1
# Show 10 Most Frequent Terms per Topic
terms(lda_model, 10)
      Topic 1     Topic 2    Topic 3    Topic 4     Topic 5     Topic 6
 [1,] "delivery"  "message"  "customer" "call"      "delivery"  "delivery"
 [2,] "date"      "date"     "cancel"   "check"     "date"      "date"
 [3,] "line"      "delivery" "invoice"  "customer"  "pm"        "not"
 [4,] "msg"       "card"     "refund"   "recipient" "msg"       "pdo"
 [5,] "address"   "msg"      "kindly"   "sender"    "change"    "msg"
 [6,] "change"    "change"   "charge"   "status"    "time"      "change"
 [7,] "remove"    "family"   "item"     "delivery"  "recipient" "instruction"
 [8,] "pte"       "wish"     "due"      "deliver"   "assist"    "add"
 [9,] "street"    "happy"    "amount"   "hamper"    "sender"    "shift"
[10,] "singapore" "dear"     "assist"   "receive"   "pl"        "special"
# Give Topics Names
top5termsPerTopic <- terms(lda_model, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
topicNames
Topic 1 Topic 2
"delivery date line msg address" "message date delivery card msg"
Topic 3 Topic 4
"customer cancel invoice refund kindly" "call check customer recipient sender"
Topic 5 Topic 6
"delivery date pm msg change" "delivery date not pdo msg"
# For every document we have a probability distribution of its contained topics
theta <- tmResult$topics
dim(theta)
[1] 17990 6
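Relating topics back to the businesses (as mentioned earlier) is not covered by the chunks in this report; a hedged sketch of the average topic proportion per Business, assuming rownames(theta) hold the comment IDs used to build the DTM:
# Sketch (not part of the original run): mean topic proportion per Business
theta_df <- data.frame(ID = rownames(theta), theta, check.names = FALSE)
topic_by_business <- theta_df %>%
inner_join(TextData %>% mutate(ID = as.character(ID)) %>% select(ID, Business), by = "ID") %>%
group_by(Business) %>%
summarise(across(-ID, mean))
topic_by_business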
Purpose: To make the results accessible and actionable for stakeholders. Explanation: Various visualisations were created, such as bar charts, word clouds, and topic distributions, to present findings in an easily interpretable format. Each visualisation was tailored to highlight specific insights, such as topic importance or sentiment trends.
# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
# Saving each plot as an image
for(topicToViz in 1:NumTopics) {
# Visualize topics as word cloud
topicToViz <- topicToViz # change for your own topic of interest
# Select the 50 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top50terms <- sort(tmResult$terms[topicToViz,], decreasing = TRUE)[1:50]
words <- names(top50terms)
# Extract the probabilities of each of the 50 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing = TRUE)[1:50]
# Visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)
# Save the plot
# ggsave(filename = paste("Word Cloud ", wordcloud, ".png", sep = ""), plot = p, width = 8, height = 5)
}
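The commented-out ggsave() call cannot capture these plots because wordcloud() draws with base graphics rather than ggplot2; a minimal sketch that writes each cloud to a PNG device instead (file names are illustrative):
# Sketch (not part of the original run): save each topic's word cloud via a png device
for (topicToViz in 1:NumTopics) {
png(filename = paste0("Word Cloud Topic ", topicToViz, ".png"), width = 800, height = 500)
top50 <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:50]
wordcloud(names(top50), top50, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
dev.off()
}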
Identify the distribution of topics in documents, i.e., comments.
# Loading other packages if not available
if(! "RColorBrewer" %in% installed.packages()) { install.packages("RColorBrewer", dependencies = TRUE) }
library(RColorBrewer)
if(! "wordcloud" %in% installed.packages()) { install.packages("wordcloud", dependencies = TRUE) }
library(wordcloud)
if(! "reshape2" %in% installed.packages()) { install.packages("reshape2", dependencies = TRUE) }
library(reshape2)
# Convert TextData$Complaint to a data frame if it's not already
textdata <- data.frame(text = TextData$Complaint, doc_id = TextData$ID, stringsAsFactors = FALSE)
# Load stopwords
english_stopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")
# Create corpus object
corpus <- Corpus(DataframeSource(textdata))
# Preprocessing chain
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, english_stopwords)
processedCorpus <- tm_map(processedCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stemDocument, language = "en")
processedCorpus <- tm_map(processedCorpus, stripWhitespace)
# Make Examples
exampleIds <- c(2, 222, 1111, 2224, 3334)
# Show sample comments
# cat("\nShow sample documents, i.e., comments:\n\n")
# lapply(corpus[exampleIds], as.character)
N <- length(exampleIds)
# Get topic proportions from example documents
topicProportionExamples <- theta[exampleIds,]
colnames(topicProportionExamples) <- topicNames
vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(exampleIds)),
variable.name = "topic", id.vars = "document")
Plot <- ggplot(data = vizDataFrame, aes(topic, value, fill = document)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
scale_fill_manual(values = c("2" = "lightgreen", "222" = "lightblue", "1111" = "pink", "2224" = "brown", "3334" = "orange"),
labels = c("Com 2", "Com 222", "Com 1111", "Com 2224", "Com 3334"),
name = "Comment") + # Custom legend title and labels
facet_wrap(~ document, ncol = 5) +
labs(title = paste("Topic Proportions in Sample Comments"),
x = NULL, y = "Proportion") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
strip.text = element_text(size = 14, face = "bold", color = "blue") # Custom facet label styling
)
# Save the plot
ggsave(filename = paste("Topic Proportion Analysis", ".png", sep = ""), plot = Plot, width = 8, height = 5)
The NRC lexicon assigns words to specific emotions such as anger, joy, sadness, and more, along with general positive or negative sentiment. This step ensures that the emotions are defined for words in the tokenized text data. Each word in the dataset is matched with its corresponding emotion. Words are aggregated by Business, Month, and Type. This allows the data to be reshaped, showing how often each emotion appears within different categories. Reshaping the data into a long format is essential for visualization with ggplot2. The Month is also formatted into a standard date type for proper chronological ordering in plots. Using str_to_title() ensures consistent capitalization of emotion labels. Assigning distinct colors for each emotion enhances the readability of the visualisations. Grouping similar emotions (e.g., red shades for negative emotions, green shades for positive ones) could improve clarity.
# Load NRC sentiment lexicon
nrc_lexicon <- get_sentiments("nrc")
# Join tokens with NRC lexicon to get emotions
emotions <- tokens %>%
inner_join(nrc_lexicon, by = "word") %>%
count(Business, Month, Type, sentiment) %>%
spread(sentiment, n, fill = 0)
# Reshape data for plotting
emotion_long <- emotions %>%
gather(key = "emotion", value = "count", -Business, -Month, -Type) %>%
mutate(Month = ymd(Month)) %>% # Requires lubridate package
mutate(emotion = str_to_title(emotion))
# Define custom colors for each emotion
emotion_colors <- c(
"Anger" = "firebrick",
"Anticipation" = "lightgrey",
"Disgust" = "brown",
"Fear" = "tomato",
"Joy" = "lightgreen",
"Negative" = "indianred",
"Positive" = "forestgreen",
"Sadness" = "salmon",
"Surprise" = "palegreen",
"Trust" = "darkolivegreen"
)
# Plot emotions per type
Plot <- ggplot(emotion_long, aes(x = Month, y = count, fill = emotion)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ Type, scales = "free") +
labs(title = "Emotion Analysis by Business and Month (Faceted by Type)",
x = NULL, y = "Emotion Count") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Unique Types in the data
unique_types <- unique(emotion_long$Type)
# Loop over each type and create an independent plot
for (type in unique_types) {
# Filter data for the current Type
data_subset <- emotion_long %>% filter(Type == type)
# Plot for the current Type
Plot <- ggplot(data_subset, aes(x = Month, y = count, fill = emotion)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = emotion_colors, name = "Emotion") + # Assign custom colors
labs(title = paste("Emotion Analysis for Type:", type),
x = NULL, y = "Emotion Count") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), # Increase title size and make it bold
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
)
# Print the plot
print(Plot)
# Save the plot
ggsave(filename = paste("Emotion_Analysis_Type_", type, ".png", sep = ""), plot = Plot, width = 10, height = 6)
}