# Loading necessary packages
## Markdown Update
# Install (when missing) and attach every required package in one loop,
# replacing 14 copies of the same if/install/library triplet.
required_pkgs <- c(
  "rmarkdown", "readxl", "tidytext", "tidyr", "textclean", "tm",
  "dplyr", "ggplot2", "lubridate", "topicmodels", "stringr",
  "kableExtra", "vtable"
)
for (pkg in required_pkgs) {
  # rownames() of installed.packages() are the package names; the original
  # matched against the whole matrix (versions, paths, ...) as well.
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}
# Global Settings
options(digits = 4)    # print numeric output with 4 significant digits
options(scipen = 999)  # strongly discourage scientific notation in output
# NOTE(review): hard-coded, user-specific working directory; every later
# write.csv()/ggsave() call writes relative to this path. Consider a
# project-relative mechanism (e.g. RStudio projects or here::here()) for
# portability — left unchanged here because downstream paths depend on it.
setwd("C:/Users/uwe/OneDrive/Documents/AD Data/TextMining/R")
# Make date string
# yymmdd date stamp in Singapore time, used to prefix output file names
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
Purpose: To ensure all data is ready for analysis without missing or incompatible entries. Explanation: This phase involves gathering and loading the data into R for processing. The dataset, containing text data such as customer comments or feedback, was imported into the R environment using read.csv(). Any encoding issues were handled to ensure compatibility. Data is retrieved from a URL. The original ID number is replaced by a new ID number in date sequence. The amended data file is saved.
# Get Your Text Data and Save Them in TextData
# Drop the second (unused) column, sort chronologically by enquiry date,
# and assign a sequential ID that follows that date order.
TextData <- TextDataRaw[, -2, drop = FALSE]
date_order <- order(TextData$EnqDate)
TextData <- TextData[date_order, ]
TextData$ID <- seq_len(nrow(TextData))
# Write Data to WD (file name is prefixed with the yymmdd date stamp)
write.csv(TextData, file = paste(today, "TextData.csv"))
Purpose: To prepare the text data for analysis by removing noise and ensuring that only meaningful content is retained. Explanation: Non-ASCII characters, punctuation, and numbers were removed. Text was converted to lowercase to ensure consistency. Encoding of text data in UTF-8 ensures compatibility with all tools. All text data is turned into lower case. The date column is converted into R date format, and a Month column is created.
# Loading other packages if not available
# Same install-if-missing-then-attach pattern, expressed as a loop.
for (pkg in c("glue", "summarytools")) {
  # Compare against package names only, not the whole metadata matrix
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}
# Re-encode complaints as UTF-8, dropping unconvertible bytes
string_utf8 <- iconv(TextData$Complaint, to = "UTF-8", sub = "")
# Strip any remaining non-ASCII (multibyte) characters
TextData$Complaint <- gsub("[^\x01-\x7F]", "", string_utf8)
# Drop NA complaints, normalise case, coerce the enquiry date, and derive
# a Month column (first day of each month) in a single pipeline.
TextData <- TextData %>%
  filter(!is.na(Complaint)) %>%
  mutate(
    Complaint = tolower(Complaint),        # complaints to lowercase
    Type = str_to_title(Type),             # comment types to Title Case
    EnqDate = as.Date(EnqDate),            # ensure consistent Date type
    Month = floor_date(EnqDate, "month")   # "Year-Month" bucket as a Date
  )
# Generate the summary
df_summary <- dfSummary(TextData)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
TextData
Dimensions: 23801 x 7
Duplicates: 0
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
---|---|---|---|---|---|---|
1 | Business [character] | 1. Division A 2. Division B | 16908 (71.0%) 6893 (29.0%) | IIIIIIIIIIIIII IIIII | 23801 (100.0%) | 0 (0.0%) |
2 | EnqDate [Date] | min : 2022-07-07 med : 2023-02-03 max : 2023-07-06 range : 11m 29d | 314 distinct values | : : : . : : . . . : : : : : : . : : : : : : : : : : | 23801 (100.0%) | 0 (0.0%) |
3 | Type [character] | 1. Cancel 2. Changes 3. Credit Card 4. Delivery Status 5. Feedback 6. Not Delivered 7. Others | 1618 ( 6.8%) 9332 (39.2%) 26 ( 0.1%) 993 ( 4.2%) 3419 (14.4%) 2681 (11.3%) 5732 (24.1%) | I IIIIIII | 23801 (100.0%) | 0 (0.0%) |
4 | Order [character] | 1. (Empty string) 2. - 3. 0 4. 3189452 5. 3189100 6. 3217360 7. 5159916 8. 5160062 9. 5172125 10. 3217115 [ 18620 others ] | 795 ( 3.3%) 138 ( 0.6%) 77 ( 0.3%) 38 ( 0.2%) 25 ( 0.1%) 25 ( 0.1%) 24 ( 0.1%) 18 ( 0.1%) 17 ( 0.1%) 15 ( 0.1%) 22629 (95.1%) | 23801 (100.0%) | 0 (0.0%) | |
5 | Complaint [character] | 1. no information 2. pdo - not in 3. pdo - shifted 4. pdo- not in 5. pdo - shifted. 6. pdo - closed 7. pdo - no such person 8. pls update delivery statu 9. pdo - discharged 10. pdo - closed. [ 16116 others ] | 5758 (24.2%) 357 ( 1.5%) 177 ( 0.7%) 124 ( 0.5%) 78 ( 0.3%) 60 ( 0.3%) 47 ( 0.2%) 41 ( 0.2%) 37 ( 0.2%) 35 ( 0.1%) 17087 (71.8%) | IIII | 23801 (100.0%) | 0 (0.0%) |
6 | ID [integer] | Mean (sd) : 11901 (6871) min < med < max: 1 < 11901 < 23801 IQR (CV) : 11900 (0.6) | 23801 distinct values (Integer sequence) | : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : | 23801 (100.0%) | 0 (0.0%) |
7 | Month [Date] | min : 2022-07-01 med : 2023-02-01 max : 2023-07-01 range : 1y 0m 0d | 13 distinct values | : : : . . : . . : . : : : : : : : : | 23801 (100.0%) | 0 (0.0%) |
# Show Characteristics of Data Frame
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Punctuation, numbers, stopwords (common words like ‘the’ and ‘and’) and names are removed. All comments are split into individual words, i.e., tokens. Any comments marked as “no information” or with NA values were excluded.
# Text cleaning: strip punctuation and digits, lowercase, then drop
# placeholder "no information" comments and collapse repeated spaces.
TextData <- TextData %>%
  mutate(
    Complaint = str_replace_all(Complaint, "[[:punct:]]", " "),  # punctuation -> space
    Complaint = str_to_lower(Complaint),                         # lowercase
    Complaint = replace_non_ascii(Complaint),                    # drop/replace non-ASCII
    Complaint = removePunctuation(Complaint),
    Complaint = removeNumbers(Complaint)
  ) %>%
  filter(Complaint != "no information") %>%
  mutate(Complaint = str_replace_all(Complaint, " {2,}", " "))   # collapse space runs
# Define custom stopwords (e.g., names)
custom_stopwords <- c(stopwords("en"), "irene", "thomas", "mathew")
# dd/mm/yy(yy)-style date pattern
date_pattern <- "\\b\\d{1,2}[ /.-]\\d{1,2}[ /.-]\\d{2,4}\\b"
name_pattern <- "\\b[A-Z][a-z]*\\b" # Simple pattern to match names
# Perform text analysis excluding stopwords and names
# NOTE(review): by this point the text is already lowercased (L86) and has
# had numbers and punctuation removed (L88-L89), so tokens contain no
# uppercase letters or digits. The name_pattern filter ([A-Z]...) can
# therefore never match, and the date_pattern filter cannot either — both
# are effectively no-ops here. Names are actually removed only via
# custom_stopwords. Confirm intent before relying on these filters.
TextDatafreq <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  filter(!word %in% custom_stopwords) %>%
  filter(!str_detect(word, date_pattern)) %>%
  filter(!str_detect(word, name_pattern)) %>%
  count(word, sort = TRUE)
Purpose: To enable word-level analysis, which is crucial for frequency analysis, sentiment analysis, and topic modeling. Explanation: Text data was split into individual words (tokens) using unnest_tokens(). This step transforms unstructured text into structured data, making it easier to analyse.
# Loading other packages if not available
if (!"textstem" %in% installed.packages()) { install.packages("textstem", dependencies = TRUE) }
library(textstem)
# Tokenize the complaints, drop standard stop words, keep non-NA tokens
tokens <- TextData %>%
  unnest_tokens(word, Complaint) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!is.na(word))
# Keep a pre-lemmatisation copy, then lemmatise the tokens
tokensOld <- tokens
tokens <- tokens %>% mutate(word = lemmatize_words(word))
# Restrict tokens to IDs present in TextData.
# NOTE: the original inner_join() pulled Business/Month/Type back in from
# TextData only to drop the duplicated *.y columns and rename the *.x ones
# — those columns already exist in tokens. Since TextData IDs are unique,
# semi_join() keeps exactly the same rows without the column round-trip.
tokens <- tokens %>%
  semi_join(TextData, by = "ID")
# Headline counts for the report, formatted with thousands separators.
# nrow() counts rows directly; the original coerced the result of
# dplyr::count() with as.numeric(), which works but is indirect.
NumRawComments <- format(nrow(TextDataRaw), big.mark = ",", scientific = FALSE)
NumComments <- format(nrow(TextData), big.mark = ",", scientific = FALSE)
NumTokens <- format(nrow(tokens), big.mark = ",", scientific = FALSE)
# Earliest and latest enquiry dates, e.g. "07 Jul 2022"
MinDate <- format(min(TextData$EnqDate), "%d %b %Y")
MaxDate <- format(max(TextData$EnqDate), "%d %b %Y")
cat("\nThis dataset includes:\n")
This dataset includes:
# Print the dataset overview lines to the console/report
cat("- Number of raw comments:", NumRawComments,"\n")
cat("- Number of clean comments:", NumComments,"\n")
cat("- Number of tokens:", NumTokens,"\n")
cat("- Comments between:", MinDate, " to ", MaxDate,"\n")
# Generate the summary
df_summary <- dfSummary(tokens)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
tokens
Dimensions: 225957 x 7
Duplicates: 47293
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
---|---|---|---|---|---|---|
1 | Business [character] | 1. Division A 2. Division B | 160584 (71.1%) 65373 (28.9%) | IIIIIIIIIIIIII IIIII | 225957 (100.0%) | 0 (0.0%) |
2 | EnqDate [Date] | min : 2022-07-07 med : 2023-02-03 max : 2023-07-06 range : 11m 29d | 312 distinct values | : : . . : : . . . : : : : : . : : : : : : : : : : | 225957 (100.0%) | 0 (0.0%) |
3 | Type [character] | 1. Cancel 2. Changes 3. Delivery Status 4. Feedback 5. Not Delivered | 11675 ( 5.2%) 142776 (63.2%) 9029 ( 4.0%) 50963 (22.6%) 11514 ( 5.1%) | I IIIIIIIIIIII | 225957 (100.0%) | 0 (0.0%) |
4 | Order [character] | 1. 0 2. 3189452 3. (Empty string) 4. 4203001839 5. 7101450475 6. 7101462612 7. 7101452379 8. 3221733 9. 4101079583 10. 3217266 [ 15067 others ] | 515 ( 0.2%) 220 ( 0.1%) 173 ( 0.1%) 159 ( 0.1%) 153 ( 0.1%) 153 ( 0.1%) 135 ( 0.1%) 134 ( 0.1%) 133 ( 0.1%) 129 ( 0.1%) 224053 (99.2%) | 225957 (100.0%) | 0 (0.0%) | |
5 | ID [integer] | Mean (sd) : 11921 (6835) min < med < max: 3 < 11909 < 23787 IQR (CV) : 11789 (0.6) | 17990 distinct values | . . . . : : . . . . : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : | 225957 (100.0%) | 0 (0.0%) |
6 | Month [Date] | min : 2022-07-01 med : 2023-02-01 max : 2023-07-01 range : 1y 0m 0d | 13 distinct values | : : : . : . . : . : : : : : : : : | 225957 (100.0%) | 0 (0.0%) |
7 | word [character] | 1. delivery 2. date 3. msg 4. pm 5. change 6. customer 7. not 8. message 9. line 10. card [ 11179 others ] | 24079 (10.7%) 20477 ( 9.1%) 9958 ( 4.4%) 7649 ( 3.4%) 6914 ( 3.1%) 3956 ( 1.8%) 3771 ( 1.7%) 3307 ( 1.5%) 2810 ( 1.2%) 2769 ( 1.2%) 140267 (62.1%) | II I | 225957 (100.0%) | 0 (0.0%) |
Purpose: To gain initial insights into the dataset and identify patterns or anomalies. Explanation: Summary Statistics: The dataset’s structure, number of entries, and date range were explored. Visualisation: Bar charts and Pareto charts were used to highlight common trends, such as the frequency of different comment types. Data frame is explored by showing available columns, structure and first five rows of data.
# Loading other packages if not available
if (!"vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Remove problematic (invalid/control) characters from TEXT columns only.
# NOTE: the original used mutate_all(), which ran iconv() over every
# column and coerced the integer ID and the Date columns to character —
# the subsequent summary reported ID as [character]. Restricting the
# clean-up to character columns preserves the column types.
TextData <- TextData %>%
  mutate(across(where(is.character), ~ iconv(.x, from = "UTF-8", to = "UTF-8", sub = "")))
# Defensive re-coercion (no-op when the columns are already Dates)
TextData$EnqDate <- as.Date(TextData$EnqDate)
TextData$Month <- as.Date(TextData$Month)
# Generate the summary
df_summary <- dfSummary(TextData)
# View the summary in the RStudio Viewer or browser
print(df_summary)
Data Frame Summary
TextData
Dimensions: 18043 x 7
Duplicates: 0
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
---|---|---|---|---|---|---|
1 | Business [character] | 1. Division A 2. Division B | 12671 (70.2%) 5372 (29.8%) | IIIIIIIIIIIIII IIIII | 18043 (100.0%) | 0 (0.0%) |
2 | EnqDate [Date] | min : 2022-07-07 med : 2023-02-04 max : 2023-07-06 range : 11m 29d | 312 distinct values | : : . . : : . : : : . : . : : : : : : : : : : | 18043 (100.0%) | 0 (0.0%) |
3 | Type [character] | 1. Cancel 2. Changes 3. Delivery Status 4. Feedback 5. Not Delivered | 1618 ( 9.0%) 9332 (51.7%) 993 ( 5.5%) 3419 (18.9%) 2681 (14.9%) | I IIIIIIIIII I III II | 18043 (100.0%) | 0 (0.0%) |
4 | Order [character] | 1. 3189452 2. 0 3. 3189100 4. 3217360 5. 5159916 6. 5160062 7. (Empty string) 8. 5172125 9. 3212425 10. 3221869 [ 15086 others ] | 38 ( 0.2%) 32 ( 0.2%) 25 ( 0.1%) 25 ( 0.1%) 23 ( 0.1%) 18 ( 0.1%) 17 ( 0.1%) 17 ( 0.1%) 14 ( 0.1%) 14 ( 0.1%) 17820 (98.8%) | 18043 (100.0%) | 0 (0.0%) | |
5 | Complaint [character] | 1. pdo not in 2. pdo shifted 3. pdo closed 4. pdo no such person 5. pdo cannot locate 6. amount should be sap capt 7. pdo no such address 8. pdo discharged 9. not in 10. pdo not in x [ 15293 others ] | 539 ( 3.0%) 329 ( 1.8%) 123 ( 0.7%) 73 ( 0.4%) 72 ( 0.4%) 64 ( 0.4%) 57 ( 0.3%) 49 ( 0.3%) 48 ( 0.3%) 44 ( 0.2%) 16645 (92.3%) | 18043 (100.0%) | 0 (0.0%) | |
6 | ID [character] | 1. 10 2. 1000 3. 10000 4. 10001 5. 10002 6. 10003 7. 10004 8. 10005 9. 10006 10. 10007 [ 18033 others ] | 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 1 ( 0.0%) 18033 (99.9%) | 18043 (100.0%) | 0 (0.0%) | |
7 | Month [Date] | min : 2022-07-01 med : 2023-02-01 max : 2023-07-01 range : 1y 0m 0d | 13 distinct values | : : : . : : . . : : : : : : : | 18043 (100.0%) | 0 (0.0%) |
# Show Characteristics of Tokens: first six rows as a styled HTML table
tokens %>%
  head() %>%
  knitr::kable(format = "html", caption = "Sample Data from Tokens") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Business | EnqDate | Type | Order | ID | Month | word |
---|---|---|---|---|---|---|
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | delivery |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | date |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | delivery |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | date |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | msg |
Division A | 2022-07-07 | Changes | 7101436529 | 3 | 2022-07-01 | kindly |
# knitr::kable(head(TextData), format = "html", caption = "Sample Data from TextData") %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Purpose: To assess the emotional tone of the text data, identifying areas with strong positive or negative sentiments. Explanation: Sentiment lexicons like Bing or NRC were used to classify words as positive, negative, or associated with specific emotions. Sentiment scores were calculated and visualized over time and by business type. The result of positive and negative sentiments by business and month is printed in bar charts.
# Load Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")
# Score each matched token (+1 positive / -1 negative) and aggregate the
# net sentiment per Business, Month, and comment Type.
sentiment_scores <- tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(Business, Month, Type) %>%
  summarise(sentiment_score = sum(score)) %>%
  ungroup()
# Sentiment bar chart: dodged columns per Month, facetted by Business,
# green for net-positive and red for net-negative scores.
Plot <- ggplot(sentiment_scores,
               aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Business, ncol = 1) +
  scale_fill_manual(
    values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
    labels = c("Positive", "Negative"),
    limits = c("TRUE", "FALSE"),  # legend order: Positive first
    name = "Sentiment"
  ) +
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")
  )
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste0("Sentiment Analysis by Business and Month 1", ".png"),
       plot = Plot, width = 8, height = 5)
# Same sentiment chart, but with positive and negative scores stacked
# within each monthly bar instead of dodged side by side.
Plot <- ggplot(sentiment_scores,
               aes(x = Month, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(position = "stack") +
  facet_wrap(~ Business, ncol = 1) +
  scale_fill_manual(
    values = c("TRUE" = "forestgreen", "FALSE" = "firebrick"),
    labels = c("Positive", "Negative"),
    limits = c("TRUE", "FALSE"),  # legend order: Positive first
    name = "Sentiment"
  ) +
  labs(title = "Sentiment Analysis by Business and Month",
       x = NULL, y = "Sentiment Score") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")
  )
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste0("Sentiment Analysis by Business and Month 2", ".png"),
       plot = Plot, width = 8, height = 5)
The number of comments is stratified by comment types and shown in a Pareto chart.
# Calculate the frequency and cumulative percentage for each Type
type_frequency <- TextData %>%
  count(Type, sort = TRUE) %>%
  mutate(cumulative_percentage = cumsum(n) / sum(n) * 100)
# Largest bar height — scale factor mapping cumulative % onto the count axis
max_count <- max(type_frequency$n)
# Plot the Pareto chart: frequency bars plus a cumulative-percentage line
Plot <- ggplot(type_frequency, aes(x = reorder(Type, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_line(aes(y = cumulative_percentage * max_count / 100, group = 1),
            color = "red", size = 1) +
  geom_point(aes(y = cumulative_percentage * max_count / 100), color = "red") +
  scale_y_continuous(
    # Secondary y-axis reverses the scaling back to percent
    sec.axis = sec_axis(~ . / max_count * 100, name = "Cumulative Percentage")
  ) +
  labs(title = "Pareto Chart of Comment Types",
       x = NULL, y = "Frequency") +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5)
  )
# Print the Plot
Plot
# Save the plot
ggsave(filename = paste0("Pareto Chart of Comment Types", ".png"),
       plot = Plot, width = 8, height = 5)
Purpose: To understand the key terms or themes in the dataset and their relevance across different categories. Explanation: The most frequently occurring words were identified and visualised. The most common words in comments are extracted by business and comment type and displayed in bar charts.
# Count word frequencies by Business and Type
word_freq <- tokens %>%
  group_by(Business, Type, word) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))
# Ungrouped copy for downstream selection.
# NOTE: the original applied group_by(Business, Type) %>% ungroup(), a
# no-op pair whose net effect is simply an ungrouped word_freq.
top_words <- word_freq %>% ungroup()
# Filter for the top 10 words by frequency within each Type
top_10_words <- top_words %>%
  group_by(Type) %>%
  slice_max(order_by = freq, n = 10) %>%
  ungroup()
# Make the Graph: top-10 word frequencies per Type, coloured by Business
Plot <- ggplot(top_10_words,
               aes(x = reorder_within(word, freq, Type), y = freq, fill = Business)) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Type, scales = "free", ncol = 2) +
  coord_flip() +        # horizontal bars keep the word labels readable
  labs(title = "Top Words by Business and Type",
       x = "Words", y = "Frequency") +
  scale_x_reordered() + # strip the reorder_within() suffixes from labels
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    strip.text = element_text(size = 14, face = "bold", color = "blue")
  )
# Print the Plot
Plot