Loading necessary libraries for data analysis and visualization ensures required packages are installed and available for use. Examples: rmarkdown for document generation, readxl for reading Excel files
# Loading necessary packages
# Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
# Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "utils" %in% installed.packages()) { install.packages("utils", dependencies = TRUE) }
library(utils)
# Decide whether PDF are saved
makePDF = FALSE
cat("Necessary Libraries loaded.")
## Necessary Libraries loaded.
# Global Settings
options(digits = 4)
options(scipen = 999)
# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")
Defining utility functions for data analysis.. Example: getmode()
calculates the mode of a vector. Also loads the vtable
package for generating descriptive statistics.
# Loading other packages
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Function to identify mode
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Reading a dataset (HeightShoe) from a specified URL ensures the data is ready for exploration and analysis.
# Add ID to Data Frame
HeightShoe$ID <- 1:nrow(HeightShoe)
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(HeightShoe)
## [1] "Height" "Shoe" "Gender" "ID"
cat("\n\nShow The Structure of the Data Frame:\n")
##
##
## Show The Structure of the Data Frame:
str(HeightShoe)
## 'data.frame': 15 obs. of 4 variables:
## $ Height: int 186 170 160 152 175 165 165 183 164 160 ...
## $ Shoe : int 47 42 38 38 42 38 40 44 37 38 ...
## $ Gender: chr "Male" "Male" "Female" "Female" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
## Height Shoe Gender ID
## 1 186 47 Male 1
## 2 170 42 Male 2
## 3 160 38 Female 3
## 4 152 38 Female 4
## 5 175 42 Male 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(HeightShoe, group = 'Gender', group.long = TRUE, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 Gender: Female
## 2 Height 9 160.22 4.8161 152 158 160 164 165 0 160
## 3 Shoe 9 38 0.86603 37 38 38 38 40 0 38
## 4 ID 9 8.6667 3.937 3 6 9 12 14 0 3
## 5
## 6 Gender: Male
## 7 Height 6 175.5 7.3417 169 170 172.5 181 186 0 170
## 8 Shoe 6 43 2.1909 41 42 42 43.5 47 0 42
## 9 ID 6 7 5.4037 1 2.75 6.5 10.25 15 0 1
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)
# Generate a nice Summary
print(dfSummary(HeightShoe), method = 'render')
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | ||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Height [integer] |
|
11 distinct values | 15 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||
2 | Shoe [integer] |
|
|
15 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||
3 | Gender [character] |
|
|
15 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||
4 | ID [integer] |
|
15 distinct values (Integer sequence) | 15 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-19
# Write data to working directory
write.csv(HeightShoe, file = paste(today, "HeightShoe.csv"))
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Calculate Correlation Coefficient
ci_cor(
x = HeightShoe$Height,
y = HeightShoe$Shoe,
probs = c(0.025, 0.975),
method = "pearson",
)
##
## Two-sided 95% normal confidence interval for the true Pearson
## correlation coefficient
##
## Sample estimate: 0.9131
## Confidence interval:
## 2.5% 97.5%
## 0.7531 0.9711
# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height, data = HeightShoe)
# Add Residuals to Data Frame
HeightShoe$Resid <- lm1$residuals
HeightShoe$Fit <- lm1$fitted.values
MeanRes <- mean(HeightShoe$Resid)
SDRes <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes
# Display results
summary(lm1)
##
## Call:
## lm(formula = Shoe ~ Height, data = HeightShoe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.352 -0.520 -0.019 0.704 1.984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.2275 5.7339 -1.09 0.3
## Height 0.2779 0.0344 8.07 0.000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 13 degrees of freedom
## Multiple R-squared: 0.834, Adjusted R-squared: 0.821
## F-statistic: 65.2 on 1 and 13 DF, p-value: 0.00000202
anova(lm1)
## Analysis of Variance Table
##
## Response: Shoe
## Df Sum Sq Mean Sq F value Pr(>F)
## Height 1 100.1 100.1 65.2 0.000002 ***
## Residuals 13 19.9 1.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Calculating the confidence interval for Pearson’s correlation coefficient provides a measure of the strength and reliability of the relationship between height and shoe size.
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Calculate Correlation Coefficient
ci_cor(
x = HeightShoe$Height,
y = HeightShoe$Shoe,
probs = c(0.025, 0.975),
method = "pearson"
)
##
## Two-sided 95% normal confidence interval for the true Pearson
## correlation coefficient
##
## Sample estimate: 0.9131
## Confidence interval:
## 2.5% 97.5%
## 0.7531 0.9711
# Plot Data with Regression Line
plot(HeightShoe$Height, HeightShoe$Shoe,
main = "Scatter Plot of Shoe Size by Height",
xlab = "Height in cm",
ylab = "EU Shoe Size",
pch = 16,
col = "blue")
abline(lm1, col = "red")
grid(col = "lightgrey", lty = "dotted")
# Calculating Cook's Distance
cutoff <- 4/(nrow(HeightShoe)-length(lm1$coefficients)-2)
# Plot Cook's Distance for each observation
plot(lm1, which = 4, cook.levels=cutoff)
abline(h = cutoff, lty = 2, col = "red")
# Show influential observations
cooks_crit = 0.5
cooks <- cooks.distance(lm1)
HeightShoe[which(abs(cooks) > cooks_crit),]
## Height Shoe Gender ID Resid Fit StdRes
## 1 186 47 Male 1 1.534 45.47 1.285
# Calculate standardised residuals
mse <- sum(lm1$residuals^2) / df.residual(lm1)
lm1$residuals / sqrt(mse*(1 - hatvalues(lm1)))
## 1 2 3 4 5 6 7 8 9 10
## 1.5546 0.8243 -0.2038 1.8192 -0.3526 -1.3626 0.3099 -0.6017 -1.9694 -0.2038
## 11 12 13 14 15
## 1.0550 0.3822 0.2720 -1.1319 -0.0160
# Loading Package if Not Available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Display Normal Probability Plot of Residuals incl. 95% Confidence Interval
qqnorm(lm1$residuals)
qqline(lm1$residuals, col = 2)
# Test normality of residuals
cat("\n\n**Normal distribution indicator of Residuals:**")
##
##
## **Normal distribution indicator of Residuals:**
p.value <- shapiro.test(lm1$residuals)$p.value
if (p.value > 0.05) {
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
cat("\nIf p-value > 0.05, data are normally distributed.")
} else {
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
cat("\nIf p-value <= 0.05, data are not normally distributed.")
}
##
## p-value: 0.9753
## If p-value > 0.05, data are normally distributed.
Generating diagnostic plots to evaluate regression model assumptions. It includes residuals vs fitted values, Q-Q plot, residuals over time, and Cook’s distance vs leverage. This helps identify potential issues with the model.
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")
# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")
# Reset plotting environment
par(mfrow = c(1, 1))
Making predictions for new height values with confidence and prediction intervals, including - Confidence interval: range for the mean prediction. - Prediction interval: range for individual predictions.
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Set x for prediction
cat("\n\nMake Prediction for New Height: 170, 176\n")
##
##
## Make Prediction for New Height: 170, 176
newHeight <- data.frame(Height = c(170, 176))
# Calculate Confidence Interval
cat("\n\nConfidence Intervals:\n")
##
##
## Confidence Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 41.02 40.28 41.76
## 2 42.69 41.69 43.68
# Calculate Prediction Interval
cat("\n\nPrediction Intervals:\n")
##
##
## Prediction Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 41.02 38.24 43.80
## 2 42.69 39.83 45.54
Creating a plot to visualize the regression line with confidence and prediction intervals uses quantile regression to calculate upper and lower bounds.
# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)
# 95% quantile, two tailed
rq_low <- rq(HeightShoe$Shoe ~ HeightShoe$Height, data = HeightShoe, tau = 0.025) #lower quantile
rq_high <- rq(HeightShoe$Shoe ~ HeightShoe$Height, data = HeightShoe, tau = 0.975) #upper quantile
HeightShoe %>%
mutate(low = rq_low$coefficients[1] + HeightShoe$Height * rq_low$coefficients[2],
high = rq_high$coefficients[1] + HeightShoe$Height * rq_high$coefficients[2]) %>%
ggplot() +
geom_point(aes(HeightShoe$Height, HeightShoe$Shoe), size = 4) +
geom_smooth(aes(HeightShoe$Height, HeightShoe$Shoe), method = "lm") +
geom_line(aes(HeightShoe$Height, low), linetype = "dashed", color = "red") +
geom_line(aes(HeightShoe$Height, high), linetype = "dashed", color = "red") +
theme_bw()
Converting the Gender variable into dummy/indicator variables is necessary to facilitate the inclusion of categorical data in regression analysis. The modified dataset is written to the working directory.
# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE) }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE) }
library(data.table)
# Convert categorical variable to factors
HeightShoe$Gender <- as.factor(HeightShoe$Gender)
HeightShoe <- one_hot(as.data.table(HeightShoe))
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
## Height Shoe Gender_Female Gender_Male ID Resid Fit StdRes
## <int> <int> <int> <int> <int> <num> <num> <num>
## 1: 186 47 0 1 1 1.5342 45.47 1.2853
## 2: 170 42 0 1 2 0.9810 41.02 0.8218
## 3: 160 38 1 0 3 -0.2398 38.24 -0.2009
## 4: 152 38 1 0 4 1.9835 36.02 1.6617
## 5: 175 42 0 1 5 -0.4086 42.41 -0.3423
cat("\n\nList of Columns (Variables):\n")
##
##
## List of Columns (Variables):
names(HeightShoe)
## [1] "Height" "Shoe" "Gender_Female" "Gender_Male"
## [5] "ID" "Resid" "Fit" "StdRes"
# Write Data to WD
write.csv(HeightShoe, file = paste(today, "HeightShoeInd.csv"))
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height + Gender_Male , data = HeightShoe)
# Add Residuals to Data Frame
HeightShoe$Resid <- lm1$residuals
HeightShoe$Fit <- lm1$fitted.values
MeanRes <- mean(HeightShoe$Resid)
SDRes <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes
# Display results
summary(lm1)
##
## Call:
## lm(formula = Shoe ~ Height + Gender_Male, data = HeightShoe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7057 -0.7990 0.0415 0.3146 2.0387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.0720 8.1562 0.99 0.3419
## Height 0.1868 0.0509 3.67 0.0032 **
## Gender_Male 2.1463 0.9647 2.22 0.0460 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.08 on 12 degrees of freedom
## Multiple R-squared: 0.882, Adjusted R-squared: 0.863
## F-statistic: 45 on 2 and 12 DF, p-value: 0.00000266
anova(lm1)
## Analysis of Variance Table
##
## Response: Shoe
## Df Sum Sq Mean Sq F value Pr(>F)
## Height 1 100.1 100.1 85.01 0.00000085 ***
## Gender_Male 1 5.8 5.8 4.95 0.046 *
## Residuals 12 14.1 1.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")
# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")
# Reset plotting environment
par(mfrow = c(1, 1))
# Get the ANOVA table
anova_table <- anova(lm1)
# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`
# Prepare data for the pie chart
SSdf <- data.frame(
Component = rownames(anova_table),
SumSq = SumSq
)
# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]
# Summarize the last three rows
# summary_row <- SSdf %>%
# slice_tail(n = 2) %>%
# summarise(Component = "Residual", SumSq = sum(SumSq))
# Drop the last three rows from the original data frame
# Rows <- nrow(SSdf) - 1
# SSdf <- SSdf[1:Rows, ]
# Add the summary row to the modified data frame
# SSdf <- bind_rows(SSdf, summary_row)
# Add Cum Sum Column
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)
# Add Percentage Column
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")
# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)
SSdf
## Component SumSq Percent Labels
## 1 Height 100.051 83.38 % Height\n100\n83.38 %
## 2 Residuals 14.123 11.77 % Residuals\n14\n11.77 %
## 3 Gender_Male 5.826 4.85 % Gender_Male\n6\n4.85 %
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
geom_bar(stat = "identity", width = 1) +
geom_label(aes(label = Labels, fill_alpha("white", 0)),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
plot.x = element_text(size = 0),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")