1 Load Libraries

Loading the libraries needed for data analysis and visualization ensures that the required packages are installed and available for use, for example rmarkdown for document generation and readxl for reading Excel files.

# Loading necessary packages
# Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "utils" %in% installed.packages()) { install.packages("utils", dependencies = TRUE) }
library(utils)

# Decide whether PDF output is saved
makePDF = FALSE

cat("Necessary Libraries loaded.")
## Necessary Libraries loaded.

2 Make Settings

# Global Settings
options(digits =   4)
options(scipen = 999)

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")

3 Make Function

Defining utility functions for data analysis. Example: getmode() calculates the mode of a vector. This step also loads the vtable package for generating descriptive statistics.

# Loading other packages
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Function to identify mode
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
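
A quick illustration of getmode() on a small vector (example values only, not part of the dataset):

# Example: the most frequent value in c(36, 38, 38, 42) is 38
getmode(c(36, 38, 38, 42))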

4 Read Dataset

Reading the HeightShoe dataset from a specified URL makes the data available for exploration and analysis.
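
A minimal sketch of this step, assuming the file is a CSV at a placeholder address (dataURL and the URL below are hypothetical; replace them with the actual source):

# Read the HeightShoe data from a URL (placeholder address)
dataURL    <- "https://example.com/HeightShoe.csv"
HeightShoe <- read.csv(dataURL)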

5 Explore Dataset

# Add ID to Data Frame
HeightShoe$ID <- 1:nrow(HeightShoe)

cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(HeightShoe)
## [1] "Height" "Gender" "Shoe"   "ID"
cat("\n\nShow The Structure of the Data Frame:\n")
## 
## 
## Show The Structure of the Data Frame:
str(HeightShoe)
## 'data.frame':    40 obs. of  4 variables:
##  $ Height: num  158 170 178 166 170 ...
##  $ Gender: chr  "Female" "Female" "Male" "Female" ...
##  $ Shoe  : num  34.6 38.4 42.8 38.2 37.4 35.7 42.6 42.6 46.8 36.1 ...
##  $ ID    : int  1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
##   Height Gender Shoe ID
## 1  157.5 Female 34.6  1
## 2  170.2 Female 38.4  2
## 3  178.1   Male 42.8  3
## 4  165.6 Female 38.2  4
## 5  170.4 Female 37.4  5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(HeightShoe, group = 'Gender', group.long = TRUE, add.median = TRUE,
   out = "csv", simple.kable = TRUE, col.align = "right", align = "right",
   digits = 5, title = 'Summary Statistics',
   summ = list(
     c('notNA(x)', 'mean(x)', 'sd(x)', 'min(x)', 'pctile(x)[25]', 'median(x)',
       'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)', 'mean(x)')
   ),
   summ.names = list(
     c('N', 'Mean', 'SD', 'Min', 'P25', 'P50', 'P75', 'Max', 'NA', 'Mode'),
     c('Count', 'Percent')
   )
)
##         Variable  N   Mean     SD   Min    P25    P50    P75   Max NA  Mode
## 1 Gender: Female                                                           
## 2         Height 18 165.54 4.5813 157.5    164  166.6  169.2   171  0 157.5
## 3           Shoe 18 36.861 1.4411  34.6 35.875  36.85 38.175  38.9  0  36.1
## 4             ID 18 18.167 12.373     1      7   17.5  24.75    40  0     1
## 5                                                                          
## 6   Gender: Male                                                           
## 7         Height 22 177.98 5.1818 168.9  174.3 177.75 182.22 186.4  0 184.7
## 8           Shoe 22 43.377 1.7886  40.5   42.3  43.25  44.25  47.5  0  42.6
## 9             ID 22 22.409 11.018     3  12.25   24.5  31.75    38  0     3
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)

# Generate a nice Summary
print(dfSummary(HeightShoe), method = 'render')

Data Frame Summary
HeightShoe
Dimensions: 40 x 4
Duplicates: 0

No  Variable            Stats / Values                           Freqs (% of Valid)    Valid        Missing
--  ------------------  ---------------------------------------  --------------------  -----------  --------
1   Height [numeric]    Mean (sd): 172.4 (7.9)                   39 distinct values    40 (100.0%)  0 (0.0%)
                        min ≤ med ≤ max: 157.5 ≤ 171.9 ≤ 186.4
                        IQR (CV): 10.5 (0)
2   Gender [character]  1. Female                                18 (45.0%)            40 (100.0%)  0 (0.0%)
                        2. Male                                  22 (55.0%)
3   Shoe [numeric]      Mean (sd): 40.4 (3.7)                    34 distinct values    40 (100.0%)  0 (0.0%)
                        min ≤ med ≤ max: 34.6 ≤ 41.1 ≤ 47.5
                        IQR (CV): 6 (0.1)
4   ID [integer]        Mean (sd): 20.5 (11.7)                   40 distinct values    40 (100.0%)  0 (0.0%)
                        min ≤ med ≤ max: 1 ≤ 20.5 ≤ 40           (Integer sequence)
                        IQR (CV): 19.5 (0.6)

Generated by summarytools 1.0.1 (R version 4.4.1)
2024-11-20

# Write data to working directory
write.csv(HeightShoe, file = paste(today, "HeightShoe.csv"))

6 Run Analysis of ShoeSize vs Height

6.1 Run Regression for HeightShoe

  • Running a simple linear regression to predict shoe size based on height.
  • Adding regression residuals, fitted values, and standardized residuals to the dataset.
  • Displaying the regression model summary and ANOVA table.
# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height, data = HeightShoe)

# Add Residuals to Data Frame
HeightShoe$Resid  <- lm1$residuals
HeightShoe$Fit    <- lm1$fitted.values
MeanRes    <- mean(HeightShoe$Resid)
SDRes      <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes

# Display results 
summary(lm1) 
## 
## Call:
## lm(formula = Shoe ~ Height, data = HeightShoe)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.717 -0.775 -0.094  1.174  2.755 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -32.6322     5.1248   -6.37           0.00000018 ***
## Height        0.4239     0.0297   14.27 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.47 on 38 degrees of freedom
## Multiple R-squared:  0.843,  Adjusted R-squared:  0.839 
## F-statistic:  204 on 1 and 38 DF,  p-value: <0.0000000000000002
anova(lm1)
## Analysis of Variance Table
## 
## Response: Shoe
##           Df Sum Sq Mean Sq F value              Pr(>F)    
## Height     1    441     441     204 <0.0000000000000002 ***
## Residuals 38     82       2                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6.2 Calculate Confidence Interval for Pearson’s Correlation Coefficient

Calculating the confidence interval for Pearson’s correlation coefficient provides a measure of the strength and reliability of the relationship between height and shoe size.

# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Calculate Correlation Coefficient
ci_cor(
  x = HeightShoe$Height,
  y = HeightShoe$Shoe,
  probs = c(0.025, 0.975),
  method = "pearson"
)
## 
##  Two-sided 95% normal confidence interval for the true Pearson
##  correlation coefficient
## 
## Sample estimate: 0.9181 
## Confidence interval:
##   2.5%  97.5% 
## 0.8495 0.9561
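
As a cross-check, this "normal" interval can be reproduced by hand with the Fisher z-transformation (a sketch; r, n, z, and se are illustration names):

# Manual Fisher z-transformation cross-check of the Pearson correlation CI
r  <- cor(HeightShoe$Height, HeightShoe$Shoe)
n  <- nrow(HeightShoe)
z  <- atanh(r)                            # Fisher z-transform of r
se <- 1 / sqrt(n - 3)                     # approximate standard error on the z scale
tanh(z + c(-1, 1) * qnorm(0.975) * se)    # back-transform; should match the interval above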

6.3 Plot Data with Regression Line

  • Creating a scatter plot to visualize the relationship between height and shoe size.
  • Overlaying the regression line to illustrate the model’s fit.
# Plot Data with Regression Line
plot(HeightShoe$Height, HeightShoe$Shoe, 
     main = "Scatter Plot of Shoe Size by Height", 
     xlab = "Height in cm", 
     ylab = "EU Shoe Size", 
     pch = 16, 
     col = "blue") 

abline(lm1, col = "red")
grid(col = "lightgrey", lty = "dotted")

6.4 Show High Leverage Points and Influential Observations

  • Calculating Cook’s Distance to identify influential data points.
  • Plotting leverage vs Cook’s distance to detect high-leverage and influential observations.
  • Highlighting observations with large Cook’s distance.
# Calculating Cook's Distance
cutoff <- 4/(nrow(HeightShoe)-length(lm1$coefficients)-2)

# Plot Cook's Distance for each observation
plot(lm1, which = 4, cook.levels=cutoff)
abline(h = cutoff, lty = 2, col = "red")

# Show influential observations
cooks_crit = 0.5
cooks <- cooks.distance(lm1)
HeightShoe[which(abs(cooks) > cooks_crit),]
## [1] Height Gender Shoe   ID     Resid  Fit    StdRes
## <0 rows> (or 0-length row.names)
# Calculate standardised residuals
mse <- sum(lm1$residuals^2) / df.residual(lm1)
lm1$residuals / sqrt(mse*(1 - hatvalues(lm1)))
##        1        2        3        4        5        6        7        8 
##  0.33620 -0.77120 -0.04694  0.43899 -1.51872  1.10007 -0.21507 -0.95732 
##        9       10       11       12       13       14       15       16 
##  0.35792 -0.49363 -0.26431 -0.04665 -1.22141 -2.57178 -0.44753 -0.10917 
##       17       18       19       20       21       22       23       24 
## -0.02039 -0.60688  1.34425 -1.43354  1.89815  1.23051 -0.42909 -0.44978 
##       25       26       27       28       29       30       31       32 
## -0.51146 -0.08303  1.08748  1.25190 -0.26013  0.62478 -1.06746  0.15367 
##       33       34       35       36       37       38       39       40 
##  1.84697  0.80061  1.05796  0.40929 -0.65991  0.93489 -1.76587  1.13803
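
Cook's distance combines each point's standardised residual with its leverage; a manual cross-check of cooks.distance(), reusing the mse computed above (h, r, and D are illustration names):

# Manual cross-check: D_i = (r_i^2 / p) * h_i / (1 - h_i), with p the number of coefficients
h <- hatvalues(lm1)
r <- lm1$residuals / sqrt(mse * (1 - h))              # internally studentised residuals, as above
D <- (r^2 / length(lm1$coefficients)) * h / (1 - h)
all.equal(unname(D), unname(cooks.distance(lm1)))     # should be TRUE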

6.5 Check Whether Residuals Are Normal

  • Creating a Q-Q plot to assess the normality of residuals from the regression.
  • Performing a Shapiro-Wilk test to formally test residual normality.
  • Providing insights on whether model assumptions are met.
# Loading Package if Not Available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Display Normal Probability Plot of Residuals
qqnorm(lm1$residuals) 
qqline(lm1$residuals, col = 2)

# Test normality of residuals
cat("\n\n**Normal distribution indicator of Residuals:**")
## 
## 
## **Normal distribution indicator of Residuals:**
p.value <- shapiro.test(lm1$residuals)$p.value
if (p.value > 0.05) {
  cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE)) 
  cat("\nSince p-value > 0.05, normality of the residuals is not rejected.")
} else {
  cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE)) 
  cat("\nSince p-value <= 0.05, normality of the residuals is rejected.")
}
## 
## p-value:  0.8266
## Since p-value > 0.05, normality of the residuals is not rejected.
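
If a confidence envelope around the Q-Q line is desired, the car package offers one (an optional sketch; car is also loaded in a later section):

# Optional: Normal Q-Q plot of residuals with a 95% confidence envelope
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
qqPlot(lm1$residuals)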

6.6 Plot All Residual Plots

Generating diagnostic plots to evaluate the regression model assumptions: residuals vs fitted values, a normal Q-Q plot, residuals over time, and Cook's distance vs leverage. These plots help identify potential issues with the model.

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted",
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")

# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)",
     xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")

# Reset plotting environment
par(mfrow = c(1, 1))

6.7 Calculate Prediction with Confidence and Prediction Interval

Making predictions for new height values with confidence and prediction intervals:

  • Confidence interval: range for the mean prediction.
  • Prediction interval: range for individual predictions.

if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Set x for prediction
cat("\n\nMake Prediction for New Height: 170, 176\n")
## 
## 
## Make Prediction for New Height: 170, 176
newHeight <- data.frame(Height = c(170, 176))

# Calculate Confidence Interval
cat("\n\nConfidence Intervals:\n")
## 
## 
## Confidence Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "confidence", level = 0.95)
##     fit   lwr   upr
## 1 39.43 38.94 39.93
## 2 41.98 41.46 42.50
# Calculate Prediction Interval
cat("\n\nPrediction Intervals:\n")
## 
## 
## Prediction Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "prediction", level = 0.95)
##     fit   lwr   upr
## 1 39.43 36.42 42.45
## 2 41.98 38.96 45.00
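
The prediction interval for a single new height can also be reproduced from the textbook formula fit ± t * s * sqrt(1 + 1/n + (x0 - mean(x))^2 / Sxx) (a cross-check sketch; x0, s, Sxx, fit0, and half are illustration names):

# Manual cross-check of the 95% prediction interval at Height = 170
x0   <- 170
s    <- summary(lm1)$sigma                                   # residual standard error
n    <- nrow(HeightShoe)
Sxx  <- sum((HeightShoe$Height - mean(HeightShoe$Height))^2)
fit0 <- unname(predict(lm1, newdata = data.frame(Height = x0)))
half <- qt(0.975, df.residual(lm1)) * s * sqrt(1 + 1/n + (x0 - mean(HeightShoe$Height))^2 / Sxx)
c(fit = fit0, lwr = fit0 - half, upr = fit0 + half)          # should match the first row above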

6.8 Plot Confidence and Prediction Interval

Creating a plot to visualize the regression line with its confidence band; quantile regression is used to approximate the upper and lower prediction bounds.

# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)

# 95% quantile, two tailed
rq_low  <- rq(Shoe ~ Height, data = HeightShoe, tau = 0.025)  # lower quantile
rq_high <- rq(Shoe ~ Height, data = HeightShoe, tau = 0.975)  # upper quantile

HeightShoe %>% 
  mutate(low  = rq_low$coefficients[1]  + Height * rq_low$coefficients[2],
         high = rq_high$coefficients[1] + Height * rq_high$coefficients[2]) %>% 
  ggplot() +
  geom_point(aes(Height, Shoe), size = 4) +
  geom_smooth(aes(Height, Shoe), method = "lm") +
  geom_line(aes(Height, low),  linetype = "dashed", color = "red") +
  geom_line(aes(Height, high), linetype = "dashed", color = "red") +
  theme_bw()

7 Run Analysis of ShoeSize vs Height and Gender_Male

7.1 Make Indicator Variable for Gender

Converting the Gender variable into dummy/indicator variables is necessary to facilitate the inclusion of categorical data in regression analysis. The modified dataset is written to the working directory.

# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE) }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE) }
library(data.table)

# Convert categorical variable to factors
HeightShoe$Gender <- as.factor(HeightShoe$Gender)
HeightShoe <- one_hot(as.data.table(HeightShoe))

cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
##    Height Gender_Female Gender_Male  Shoe    ID    Resid   Fit   StdRes
##     <num>         <int>       <int> <num> <int>    <num> <num>    <num>
## 1:  157.5             1           0  34.6     1  0.46503 34.13  0.32035
## 2:  170.2             1           0  38.4     2 -1.11874 39.52 -0.77068
## 3:  178.1             0           1  42.8     3 -0.06769 42.87 -0.04663
## 4:  165.6             1           0  38.2     4  0.63129 37.57  0.43488
## 5:  170.4             1           0  37.4     5 -2.20352 39.60 -1.51797
cat("\n\nList of Columns (Variables):\n")
## 
## 
## List of Columns (Variables):
names(HeightShoe)
## [1] "Height"        "Gender_Female" "Gender_Male"   "Shoe"         
## [5] "ID"            "Resid"         "Fit"           "StdRes"
# Write Data to WD
write.csv(HeightShoe, file = paste(today, "HeightShoeInd.csv"))

7.2 Run Regression for HeightShoe with Gender

  • Running a multiple regression to predict shoe size using height and gender; only the Gender_Male indicator is entered, since including both dummy columns would be redundant (perfectly collinear with the intercept).
  • Adding regression residuals, fitted values, and standardized residuals to the dataset.
  • Displaying the regression model summary and ANOVA table.
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Perform a regression Shoe = b0 + b1 Height + b2 Gender_Male
lm1 <- lm(Shoe ~ Height + Gender_Male, data = HeightShoe)

# Add Residuals to Data Frame
HeightShoe$Resid  <- lm1$residuals
HeightShoe$Fit    <- lm1$fitted.values
MeanRes    <- mean(HeightShoe$Resid)
SDRes      <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes

# Display results 
summary(lm1) 
## 
## Call:
## lm(formula = Shoe ~ Height + Gender_Male, data = HeightShoe)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.691 -0.670  0.175  0.649  1.954 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  -5.7805     5.7761   -1.00         0.32    
## Height        0.2576     0.0349    7.39 0.0000000086 ***
## Gender_Male   3.3125     0.5486    6.04 0.0000005579 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.06 on 37 degrees of freedom
## Multiple R-squared:  0.921,  Adjusted R-squared:  0.917 
## F-statistic:  215 on 2 and 37 DF,  p-value: <0.0000000000000002
anova(lm1)
## Analysis of Variance Table
## 
## Response: Shoe
##             Df Sum Sq Mean Sq F value               Pr(>F)    
## Height       1    441     441   393.9 < 0.0000000000000002 ***
## Gender_Male  1     41      41    36.5           0.00000056 ***
## Residuals   37     41       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.3 Plot All Residual Plots

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted",
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")

# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)",
     xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Leverage
plot(lm1, which = 5, main = "Cook's Distance vs Leverage")

# Reset plotting environment
par(mfrow = c(1, 1))

7.4 Prepare Data for Pie Chart

  • Preparing data for visualizing the sum of squares from the ANOVA table.
  • Includes calculating percentages and cumulative sums for better presentation.
# Get the ANOVA table
anova_table <- anova(lm1)

# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`

# Prepare data for the pie chart
SSdf <- data.frame(
  Component = rownames(anova_table),
  SumSq = SumSq
)

# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]

# Calculate Percentage Share and Cumulative Sum
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)

# Format Percentage Column as text
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")

# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)

SSdf
##     Component  SumSq Percent                 Labels
## 1      Height 440.66 84.28 %   Height\n441\n84.28 %
## 2   Residuals  41.39  7.92 %  Residuals\n41\n7.92 %
## 3 Gender_Male  40.79   7.8 % Gender_Male\n41\n7.8 %

7.5 Show Pie Chart of Sum of Squares for Regression Model

  • Creating a pie chart to illustrate the contribution of each regression component to the total sum of squares.
  • Enhances understanding of model variance attribution.
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Cereal Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")