1 Load Libraries

Loading necessary libraries for data analysis and visualization ensures required packages are installed and available for use. Examples: rmarkdown for document generation, readxl for reading Excel files

# Loading necessary packages
# Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "utils" %in% installed.packages()) { install.packages("utils", dependencies = TRUE) }
library(utils)

# Decide whether PDF are saved
makePDF = FALSE

cat("Necessary Libraries loaded.")
## Necessary Libraries loaded.

2 Make Settings

# Global Settings
options(digits =   4)
options(scipen = 999)

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")

3 Make Function

Defining utility functions for data analysis.. Example: getmode() calculates the mode of a vector. Also loads the vtable package for generating descriptive statistics.

# Loading other packages
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Function to identify mode
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

4 Read Dataset

Reading a dataset (HeightShoe) from a specified URL ensures the data is ready for exploration and analysis.

5 Explore Dataset

# Add ID to Data Frame
HeightShoe$ID <- 1:nrow(HeightShoe)

cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(HeightShoe)
## [1] "Height" "Shoe"   "Gender" "ID"
cat("\n\nShow The Structure of the Data Frame:\n")
## 
## 
## Show The Structure of the Data Frame:
str(HeightShoe)
## 'data.frame':    15 obs. of  4 variables:
##  $ Height: int  186 170 160 152 175 165 165 183 164 160 ...
##  $ Shoe  : int  47 42 38 38 42 38 40 44 37 38 ...
##  $ Gender: chr  "Male" "Male" "Female" "Female" ...
##  $ ID    : int  1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
##   Height Shoe Gender ID
## 1    186   47   Male  1
## 2    170   42   Male  2
## 3    160   38 Female  3
## 4    152   38 Female  4
## 5    175   42   Male  5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(HeightShoe, group = 'Gender', group.long = TRUE, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,    
title='Summary Statistics',
summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##         Variable N   Mean      SD Min  P25   P50   P75 Max NA Mode
## 1 Gender: Female                                                  
## 2         Height 9 160.22  4.8161 152  158   160   164 165  0  160
## 3           Shoe 9     38 0.86603  37   38    38    38  40  0   38
## 4             ID 9 8.6667   3.937   3    6     9    12  14  0    3
## 5                                                                 
## 6   Gender: Male                                                  
## 7         Height 6  175.5  7.3417 169  170 172.5   181 186  0  170
## 8           Shoe 6     43  2.1909  41   42    42  43.5  47  0   42
## 9             ID 6      7  5.4037   1 2.75   6.5 10.25  15  0    1
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)

# Generate a nice Summary
print(dfSummary(HeightShoe), method = 'render')

Data Frame Summary

HeightShoe

Dimensions: 15 x 4
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Height [integer]
Mean (sd) : 166.3 (9.6)
min ≤ med ≤ max:
152 ≤ 165 ≤ 186
IQR (CV) : 10 (0.1)
11 distinct values 15 (100.0%) 0 (0.0%)
2 Shoe [integer]
Mean (sd) : 40 (2.9)
min ≤ med ≤ max:
37 ≤ 38 ≤ 47
IQR (CV) : 4 (0.1)
37:2(13.3%)
38:6(40.0%)
40:1(6.7%)
41:1(6.7%)
42:3(20.0%)
44:1(6.7%)
47:1(6.7%)
15 (100.0%) 0 (0.0%)
3 Gender [character]
1. Female
2. Male
9(60.0%)
6(40.0%)
15 (100.0%) 0 (0.0%)
4 ID [integer]
Mean (sd) : 8 (4.5)
min ≤ med ≤ max:
1 ≤ 8 ≤ 15
IQR (CV) : 7 (0.6)
15 distinct values (Integer sequence) 15 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-19

# Write data to working directory
write.csv(HeightShoe, file = paste(today, "HeightShoe.csv"))

6 Run Analysis of ShoeSize vs Height

6.1 Run Regression for HeightShoe

  • Running a simple linear regression to predict shoe size based on height.
  • Adding regression residuals, fitted values, and standardized residuals to the dataset.
  • Displaying the regression model summary and ANOVA table.
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Calculate Correlation Coefficient
ci_cor(
  x = HeightShoe$Height,
  y = HeightShoe$Shoe,
  probs = c(0.025, 0.975),
  method = "pearson",
)
## 
##  Two-sided 95% normal confidence interval for the true Pearson
##  correlation coefficient
## 
## Sample estimate: 0.9131 
## Confidence interval:
##   2.5%  97.5% 
## 0.7531 0.9711
# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height, data = HeightShoe)

# Add Residuals to Data Frame
HeightShoe$Resid  <- lm1$residuals
HeightShoe$Fit    <- lm1$fitted.values
MeanRes    <- mean(HeightShoe$Resid)
SDRes      <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes

# Display results 
summary(lm1) 
## 
## Call:
## lm(formula = Shoe ~ Height, data = HeightShoe)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.352 -0.520 -0.019  0.704  1.984 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.2275     5.7339   -1.09      0.3    
## Height        0.2779     0.0344    8.07 0.000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 13 degrees of freedom
## Multiple R-squared:  0.834,  Adjusted R-squared:  0.821 
## F-statistic: 65.2 on 1 and 13 DF,  p-value: 0.00000202
anova(lm1)
## Analysis of Variance Table
## 
## Response: Shoe
##           Df Sum Sq Mean Sq F value   Pr(>F)    
## Height     1  100.1   100.1    65.2 0.000002 ***
## Residuals 13   19.9     1.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6.2 Calculate Confidence Interval for Pearson’s Correlation Coefficient

Calculating the confidence interval for Pearson’s correlation coefficient provides a measure of the strength and reliability of the relationship between height and shoe size.

# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Calculate Correlation Coefficient
ci_cor(
  x = HeightShoe$Height,
  y = HeightShoe$Shoe,
  probs = c(0.025, 0.975),
  method = "pearson"
)
## 
##  Two-sided 95% normal confidence interval for the true Pearson
##  correlation coefficient
## 
## Sample estimate: 0.9131 
## Confidence interval:
##   2.5%  97.5% 
## 0.7531 0.9711

6.3 Plot Data with Regression Line

  • Creating a scatter plot to visualize the relationship between height and shoe size.
  • Overlaying the regression line to illustrate the model’s fit.
# Plot Data with Regression Line
plot(HeightShoe$Height, HeightShoe$Shoe, 
     main = "Scatter Plot of Shoe Size by Height", 
     xlab = "Height in cm", 
     ylab = "EU Shoe Size", 
     pch = 16, 
     col = "blue") 

abline(lm1, col = "red")
grid(col = "lightgrey", lty = "dotted")

6.4 Show High Leverage Points and Influential Observations

  • Calculating Cook’s Distance to identify influential data points.
  • Plotting leverage vs Cook’s distance to detect high-leverage and influential observations.
  • Highlighting observations with large Cook’s distance.
# Calculating Cook's Distance
cutoff <- 4/(nrow(HeightShoe)-length(lm1$coefficients)-2)

# Plot Cook's Distance for each observation
plot(lm1, which = 4, cook.levels=cutoff)
abline(h = cutoff, lty = 2, col = "red")

# Show influential observations
cooks_crit = 0.5
cooks <- cooks.distance(lm1)
HeightShoe[which(abs(cooks) > cooks_crit),]
##   Height Shoe Gender ID Resid   Fit StdRes
## 1    186   47   Male  1 1.534 45.47  1.285
# Calculate standardised residuals
mse <- sum(lm1$residuals^2) / df.residual(lm1)
lm1$residuals / sqrt(mse*(1 - hatvalues(lm1)))
##       1       2       3       4       5       6       7       8       9      10 
##  1.5546  0.8243 -0.2038  1.8192 -0.3526 -1.3626  0.3099 -0.6017 -1.9694 -0.2038 
##      11      12      13      14      15 
##  1.0550  0.3822  0.2720 -1.1319 -0.0160

6.5 Check Whether Residuals Are Normal

  • Creating a Q-Q plot to assess the normality of residuals from the regression.
  • Performing a Shapiro-Wilk test to formally test residual normality.
  • Providing insights on whether model assumptions are met.
# Loading Package if Not Available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Display Normal Probability Plot of Residuals incl. 95% Confidence Interval
qqnorm(lm1$residuals) 
qqline(lm1$residuals, col = 2)

# Test normality of residuals
cat("\n\n**Normal distribution indicator of Residuals:**")
## 
## 
## **Normal distribution indicator of Residuals:**
p.value <- shapiro.test(lm1$residuals)$p.value
if (p.value > 0.05) {
  cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE)) 
  cat("\nIf p-value > 0.05, data are normally distributed.")
} else {
  cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE)) 
  cat("\nIf p-value <= 0.05, data are not normally distributed.")
}
## 
## p-value:  0.9753
## If p-value > 0.05, data are normally distributed.

6.6 Plot All Residual Plots

Generating diagnostic plots to evaluate regression model assumptions. It includes residuals vs fitted values, Q-Q plot, residuals over time, and Cook’s distance vs leverage. This helps identify potential issues with the model.

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values",      ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")

# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes,      main = "Residuals over Time (Sequence of data)",      xlab = "Time (ID)",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")

# Reset plotting environment
par(mfrow = c(1, 1))

6.7 Calculate Prediction with Confidence and Prediction Interval

Making predictions for new height values with confidence and prediction intervals, including - Confidence interval: range for the mean prediction. - Prediction interval: range for individual predictions.

if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Set x for prediction
cat("\n\nMake Prediction for New Height: 170, 176\n")
## 
## 
## Make Prediction for New Height: 170, 176
newHeight <- data.frame(Height = c(170, 176))

# Calculate Confidence Interval
cat("\n\nConfidence Intervals:\n")
## 
## 
## Confidence Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "confidence", level = 0.95)
##     fit   lwr   upr
## 1 41.02 40.28 41.76
## 2 42.69 41.69 43.68
# Calculate Prediction Interval
cat("\n\nPrediction Intervals:\n")
## 
## 
## Prediction Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "prediction", level = 0.95)
##     fit   lwr   upr
## 1 41.02 38.24 43.80
## 2 42.69 39.83 45.54

6.8 Plot Confidence and Prediction Interval

Creating a plot to visualize the regression line with confidence and prediction intervals uses quantile regression to calculate upper and lower bounds.

# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)

# 95% quantile, two tailed
rq_low  <- rq(HeightShoe$Shoe ~ HeightShoe$Height, data = HeightShoe, tau = 0.025)  #lower quantile
rq_high <- rq(HeightShoe$Shoe ~ HeightShoe$Height, data = HeightShoe, tau = 0.975)  #upper quantile

HeightShoe %>% 
  mutate(low  = rq_low$coefficients[1]  + HeightShoe$Height * rq_low$coefficients[2],
         high = rq_high$coefficients[1] + HeightShoe$Height * rq_high$coefficients[2]) %>% 
  ggplot() +
  geom_point(aes(HeightShoe$Height, HeightShoe$Shoe), size = 4) +
  geom_smooth(aes(HeightShoe$Height, HeightShoe$Shoe), method = "lm") +
  geom_line(aes(HeightShoe$Height, low),  linetype = "dashed", color = "red") +
  geom_line(aes(HeightShoe$Height, high), linetype = "dashed", color = "red") +
  theme_bw()

7 Run Analysis of ShoeSize vs Height and Gender_Male

7.1 Make Indicator Variable for Gender

Converting the Gender variable into dummy/indicator variables is necessary to facilitate the inclusion of categorical data in regression analysis. The modified dataset is written to the working directory.

# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE) }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE) }
library(data.table)

# Convert categorical variable to factors
HeightShoe$Gender <- as.factor(HeightShoe$Gender)
HeightShoe <- one_hot(as.data.table(HeightShoe))

cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
##    Height  Shoe Gender_Female Gender_Male    ID   Resid   Fit  StdRes
##     <int> <int>         <int>       <int> <int>   <num> <num>   <num>
## 1:    186    47             0           1     1  1.5342 45.47  1.2853
## 2:    170    42             0           1     2  0.9810 41.02  0.8218
## 3:    160    38             1           0     3 -0.2398 38.24 -0.2009
## 4:    152    38             1           0     4  1.9835 36.02  1.6617
## 5:    175    42             0           1     5 -0.4086 42.41 -0.3423
cat("\n\nList of Columns (Variables):\n")
## 
## 
## List of Columns (Variables):
names(HeightShoe)
## [1] "Height"        "Shoe"          "Gender_Female" "Gender_Male"  
## [5] "ID"            "Resid"         "Fit"           "StdRes"
# Write Data to WD
write.csv(HeightShoe, file = paste(today, "HeightShoeInd.csv"))

7.2 Run Regression for HeightShoe with Gender

  • Running a multiple regression to predict shoe size using height and gender.
  • Adding regression residuals, fitted values, and standardized residuals to the dataset.
  • Displaying the regression model summary and ANOVA table.
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height + Gender_Male , data = HeightShoe)

# Add Residuals to Data Frame
HeightShoe$Resid  <- lm1$residuals
HeightShoe$Fit    <- lm1$fitted.values
MeanRes    <- mean(HeightShoe$Resid)
SDRes      <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes

# Display results 
summary(lm1) 
## 
## Call:
## lm(formula = Shoe ~ Height + Gender_Male, data = HeightShoe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7057 -0.7990  0.0415  0.3146  2.0387 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   8.0720     8.1562    0.99   0.3419   
## Height        0.1868     0.0509    3.67   0.0032 **
## Gender_Male   2.1463     0.9647    2.22   0.0460 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.08 on 12 degrees of freedom
## Multiple R-squared:  0.882,  Adjusted R-squared:  0.863 
## F-statistic:   45 on 2 and 12 DF,  p-value: 0.00000266
anova(lm1)
## Analysis of Variance Table
## 
## Response: Shoe
##             Df Sum Sq Mean Sq F value     Pr(>F)    
## Height       1  100.1   100.1   85.01 0.00000085 ***
## Gender_Male  1    5.8     5.8    4.95      0.046 *  
## Residuals   12   14.1     1.2                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.3 Plot All Residual Plots

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values",      ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")

# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes,      main = "Residuals over Time (Sequence of data)",      xlab = "Time (ID)",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")

# Reset plotting environment
par(mfrow = c(1, 1))

7.4 Prepare Data for Pie Chart

  • Preparing data for visualizing the sum of squares from the ANOVA table.
  • Includes calculating percentages and cumulative sums for better presentation.
# Get the ANOVA table
anova_table <- anova(lm1)

# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`

# Prepare data for the pie chart
SSdf <- data.frame(
  Component = rownames(anova_table),
  SumSq = SumSq
)

# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]

# Summarize the last three rows
# summary_row <- SSdf %>%
#   slice_tail(n = 2) %>%
#   summarise(Component = "Residual", SumSq = sum(SumSq))

# Drop the last three rows from the original data frame
# Rows <- nrow(SSdf) - 1
# SSdf <- SSdf[1:Rows, ]

# Add the summary row to the modified data frame
# SSdf <- bind_rows(SSdf, summary_row)

# Add Cum Sum Column
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq) 

# Add Percentage Column
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")

# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)

SSdf
##     Component   SumSq Percent                 Labels
## 1      Height 100.051 83.38 %   Height\n100\n83.38 %
## 2   Residuals  14.123 11.77 % Residuals\n14\n11.77 %
## 3 Gender_Male   5.826  4.85 % Gender_Male\n6\n4.85 %

7.5 Show Pie Chart of Sum of Squares for Regression Model

  • Creating a pie chart to illustrate the contribution of each regression component to the total sum of squares.
  • Enhances understanding of model variance attribution.
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels, fill_alpha("white", 0)),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    plot.x = element_text(size = 0),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Cereal Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")