Loading the necessary libraries for data analysis and visualization ensures that the required packages are installed and available for use, for example rmarkdown for document generation and readxl for reading Excel files.
# Loading necessary packages
# rmarkdown for document generation
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
# Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "utils" %in% installed.packages()) { install.packages("utils", dependencies = TRUE) }
library(utils)
# Decide whether PDFs are saved
makePDF <- FALSE
cat("Necessary Libraries loaded.")
## Necessary Libraries loaded.
# Global Settings
options(digits = 4)
options(scipen = 999)
# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")
Defining utility functions for data analysis, for example getmode(), which calculates the mode of a vector. This step also loads the vtable
package for generating descriptive statistics.
# Loading other packages
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Function to identify mode
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
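A quick check of getmode() on a toy vector (hypothetical values, not taken from the dataset) illustrates its behaviour:
# Example only: the most frequent value is returned; ties resolve to the value seen first
getmode(c(1, 2, 2, 3, 3))   # returns 2 (the first of the tied modes)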
Reading the dataset (HeightShoe) from a specified URL makes the data available for exploration and analysis.
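The import step itself is not shown in this chunk; a minimal sketch of how the file might be read is given below (the URL is a placeholder, not the actual source, and the Excel variant assumes a local copy):
# Hypothetical import (placeholder URL; adjust to the real data source)
HeightShoe <- read.csv("https://example.com/HeightShoe.csv", stringsAsFactors = FALSE)
# Alternatively, if a local Excel copy is available (readxl was loaded above):
# HeightShoe <- as.data.frame(read_excel("HeightShoe.xlsx"))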
# Add ID to Data Frame
HeightShoe$ID <- 1:nrow(HeightShoe)
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(HeightShoe)
## [1] "Height" "Gender" "Shoe" "ID"
cat("\n\nShow The Structure of the Data Frame:\n")
##
##
## Show The Structure of the Data Frame:
str(HeightShoe)
## 'data.frame': 40 obs. of 4 variables:
## $ Height: num 158 170 178 166 170 ...
## $ Gender: chr "Female" "Female" "Male" "Female" ...
## $ Shoe : num 34.6 38.4 42.8 38.2 37.4 35.7 42.6 42.6 46.8 36.1 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
## Height Gender Shoe ID
## 1 157.5 Female 34.6 1
## 2 170.2 Female 38.4 2
## 3 178.1 Male 42.8 3
## 4 165.6 Female 38.2 4
## 5 170.4 Female 37.4 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(HeightShoe, group = 'Gender', group.long = TRUE, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 Gender: Female
## 2 Height 18 165.54 4.5813 157.5 164 166.6 169.2 171 0 157.5
## 3 Shoe 18 36.861 1.4411 34.6 35.875 36.85 38.175 38.9 0 36.1
## 4 ID 18 18.167 12.373 1 7 17.5 24.75 40 0 1
## 5
## 6 Gender: Male
## 7 Height 22 177.98 5.1818 168.9 174.3 177.75 182.22 186.4 0 184.7
## 8 Shoe 22 43.377 1.7886 40.5 42.3 43.25 44.25 47.5 0 42.6
## 9 ID 22 22.409 11.018 3 12.25 24.5 31.75 38 0 3
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)
# Generate a nice Summary
print(dfSummary(HeightShoe), method = 'render')
| No | Variable           | Freqs (% of Valid)                    | Valid       | Missing  |
|----|--------------------|---------------------------------------|-------------|----------|
| 1  | Height [numeric]   | 39 distinct values                    | 40 (100.0%) | 0 (0.0%) |
| 2  | Gender [character] | 2 distinct values (Female, Male)      | 40 (100.0%) | 0 (0.0%) |
| 3  | Shoe [numeric]     | 34 distinct values                    | 40 (100.0%) | 0 (0.0%) |
| 4  | ID [integer]       | 40 distinct values (Integer sequence) | 40 (100.0%) | 0 (0.0%) |

Generated by summarytools 1.0.1 (R version 4.4.1), 2024-11-20 (rendered dfSummary table; Stats/Values and Graph columns not reproduced).
# Write data to working directory
write.csv(HeightShoe, file = paste(today, "HeightShoe.csv"))
# Perform a regression Shoe = b0 + b1 Height
lm1 <- lm(Shoe ~ Height, data = HeightShoe)
# Add Residuals to Data Frame
HeightShoe$Resid <- lm1$residuals
HeightShoe$Fit <- lm1$fitted.values
MeanRes <- mean(HeightShoe$Resid)
SDRes <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes
# Display results
summary(lm1)
##
## Call:
## lm(formula = Shoe ~ Height, data = HeightShoe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.717 -0.775 -0.094 1.174 2.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.6322 5.1248 -6.37 0.00000018 ***
## Height 0.4239 0.0297 14.27 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.47 on 38 degrees of freedom
## Multiple R-squared: 0.843, Adjusted R-squared: 0.839
## F-statistic: 204 on 1 and 38 DF, p-value: <0.0000000000000002
anova(lm1)
## Analysis of Variance Table
##
## Response: Shoe
## Df Sum Sq Mean Sq F value Pr(>F)
## Height 1 441 441 204 <0.0000000000000002 ***
## Residuals 38 82 2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
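The fitted equation is therefore Shoe ≈ -32.63 + 0.424 × Height; for example, a height of 170 cm gives approximately -32.63 + 0.424 × 170 ≈ 39.4, which matches the prediction intervals computed further below.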
Calculating the confidence interval for Pearson’s correlation coefficient provides a measure of the strength and reliability of the relationship between height and shoe size.
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Calculate Correlation Coefficient
ci_cor(
x = HeightShoe$Height,
y = HeightShoe$Shoe,
probs = c(0.025, 0.975),
method = "pearson"
)
##
## Two-sided 95% normal confidence interval for the true Pearson
## correlation coefficient
##
## Sample estimate: 0.9181
## Confidence interval:
## 2.5% 97.5%
## 0.8495 0.9561
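As a cross-check, the same normal-approximation interval can be reproduced manually with Fisher's z-transformation; a minimal sketch (assuming the standard formula with n - 3 in the standard error), which should return roughly 0.85 to 0.96 as reported above:
# Manual Fisher z confidence interval for the Pearson correlation
r  <- cor(HeightShoe$Height, HeightShoe$Shoe)
n  <- nrow(HeightShoe)
z  <- atanh(r)                          # Fisher transform of r
se <- 1 / sqrt(n - 3)                   # approximate standard error on the z scale
tanh(z + c(-1, 1) * qnorm(0.975) * se)  # back-transform the bounds to the r scale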
# Plot Data with Regression Line
plot(HeightShoe$Height, HeightShoe$Shoe,
main = "Scatter Plot of Shoe Size by Height",
xlab = "Height in cm",
ylab = "EU Shoe Size",
pch = 16,
col = "blue")
abline(lm1, col = "red")
grid(col = "lightgrey", lty = "dotted")
# Calculate a rule-of-thumb cutoff for Cook's Distance
cutoff <- 4/(nrow(HeightShoe)-length(lm1$coefficients)-2)  # with n = 40 and 2 coefficients: 4/36 ≈ 0.11
# Plot Cook's Distance for each observation
plot(lm1, which = 4, cook.levels=cutoff)
abline(h = cutoff, lty = 2, col = "red")
# Show influential observations
cooks_crit = 0.5
cooks <- cooks.distance(lm1)
HeightShoe[which(abs(cooks) > cooks_crit),]
## [1] Height Gender Shoe ID Resid Fit StdRes
## <0 rows> (or 0-length row.names)
# Calculate standardised residuals
mse <- sum(lm1$residuals^2) / df.residual(lm1)
lm1$residuals / sqrt(mse*(1 - hatvalues(lm1)))
## 1 2 3 4 5 6 7 8
## 0.33620 -0.77120 -0.04694 0.43899 -1.51872 1.10007 -0.21507 -0.95732
## 9 10 11 12 13 14 15 16
## 0.35792 -0.49363 -0.26431 -0.04665 -1.22141 -2.57178 -0.44753 -0.10917
## 17 18 19 20 21 22 23 24
## -0.02039 -0.60688 1.34425 -1.43354 1.89815 1.23051 -0.42909 -0.44978
## 25 26 27 28 29 30 31 32
## -0.51146 -0.08303 1.08748 1.25190 -0.26013 0.62478 -1.06746 0.15367
## 33 34 35 36 37 38 39 40
## 1.84697 0.80061 1.05796 0.40929 -0.65991 0.93489 -1.76587 1.13803
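For comparison, base R's rstandard() computes the same internally studentised residuals and should reproduce the values above:
# Equivalent built-in calculation (sketch for comparison)
rstandard(lm1)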
# Loading Package if Not Available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Display Normal Probability Plot of Residuals
qqnorm(lm1$residuals)
qqline(lm1$residuals, col = 2)
# Test normality of residuals
cat("\n\n**Normal distribution indicator of Residuals:**")
##
##
## **Normal distribution indicator of Residuals:**
p.value <- shapiro.test(lm1$residuals)$p.value
if (p.value > 0.05) {
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
cat("\nIf p-value > 0.05, data are normally distributed.")
} else {
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
cat("\nIf p-value <= 0.05, data are not normally distributed.")
}
##
## p-value: 0.8266
## If p-value > 0.05, data are normally distributed.
Generating diagnostic plots to evaluate the regression model assumptions: residuals vs. fitted values, a normal Q-Q plot, residuals over time (the sequence of observations), and residuals vs. leverage with Cook's distance contours. These plots help identify potential issues with the model.
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")
# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs Leverage
plot(lm1, which = 5, main = "Residuals vs Leverage")
# Reset plotting environment
par(mfrow = c(1, 1))
Making predictions for new height values with confidence and prediction intervals: the confidence interval gives a range for the mean response at a given height, while the prediction interval gives a range for an individual observation.
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Set x for prediction
cat("\n\nMake Prediction for New Height: 170, 176\n")
##
##
## Make Prediction for New Height: 170, 176
newHeight <- data.frame(Height = c(170, 176))
# Calculate Confidence Interval
cat("\n\nConfidence Intervals:\n")
##
##
## Confidence Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 39.43 38.94 39.93
## 2 41.98 41.46 42.50
# Calculate Prediction Interval
cat("\n\nPrediction Intervals:\n")
##
##
## Prediction Intervals:
predict(lm1, newdata = newHeight, se.fit = FALSE, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 39.43 36.42 42.45
## 2 41.98 38.96 45.00
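For reference, the first prediction interval can also be reproduced from the textbook formula fit ± t(0.975, n-2) * s * sqrt(1 + 1/n + (x0 - mean(x))^2 / Sxx); a minimal sketch (assuming lm1 is still the simple Height-only model), which should return approximately 36.4 and 42.5:
# Manual 95% prediction interval for Height = 170 (illustration only)
x0    <- 170
n     <- nrow(HeightShoe)
s     <- summary(lm1)$sigma                       # residual standard error
xbar  <- mean(HeightShoe$Height)
Sxx   <- sum((HeightShoe$Height - xbar)^2)
fit0  <- coef(lm1)[1] + coef(lm1)[2] * x0         # point prediction
se_pi <- s * sqrt(1 + 1/n + (x0 - xbar)^2 / Sxx)  # prediction standard error
fit0 + c(-1, 1) * qt(0.975, df = df.residual(lm1)) * se_pi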
Creating a plot to visualize the regression line with its confidence band; quantile regression is used to calculate upper and lower bounds that approximate a prediction band.
# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)
# 95% quantile, two tailed
rq_low <- rq(Shoe ~ Height, data = HeightShoe, tau = 0.025)  # lower quantile
rq_high <- rq(Shoe ~ Height, data = HeightShoe, tau = 0.975) # upper quantile
HeightShoe %>%
  mutate(low = rq_low$coefficients[1] + Height * rq_low$coefficients[2],
         high = rq_high$coefficients[1] + Height * rq_high$coefficients[2]) %>%
  ggplot() +
  geom_point(aes(Height, Shoe), size = 4) +
  geom_smooth(aes(Height, Shoe), method = "lm") +
  geom_line(aes(Height, low), linetype = "dashed", color = "red") +
  geom_line(aes(Height, high), linetype = "dashed", color = "red") +
  theme_bw()
Converting the Gender variable into dummy/indicator variables is necessary to facilitate the inclusion of categorical data in regression analysis. The modified dataset is written to the working directory.
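For reference, base R's model.matrix() can produce equivalent indicator columns without additional packages; a minimal sketch (illustration only, not the approach used below):
# Equivalent dummy coding with base R (columns GenderFemale / GenderMale)
head(model.matrix(~ Gender - 1, data = HeightShoe), 3)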
# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE) }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE) }
library(data.table)
# Convert categorical variable to factors
HeightShoe$Gender <- as.factor(HeightShoe$Gender)
HeightShoe <- one_hot(as.data.table(HeightShoe))
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(HeightShoe, 5)
## Height Gender_Female Gender_Male Shoe ID Resid Fit StdRes
## <num> <int> <int> <num> <int> <num> <num> <num>
## 1: 157.5 1 0 34.6 1 0.46503 34.13 0.32035
## 2: 170.2 1 0 38.4 2 -1.11874 39.52 -0.77068
## 3: 178.1 0 1 42.8 3 -0.06769 42.87 -0.04663
## 4: 165.6 1 0 38.2 4 0.63129 37.57 0.43488
## 5: 170.4 1 0 37.4 5 -2.20352 39.60 -1.51797
cat("\n\nList of Columns (Variables):\n")
##
##
## List of Columns (Variables):
names(HeightShoe)
## [1] "Height" "Gender_Female" "Gender_Male" "Shoe"
## [5] "ID" "Resid" "Fit" "StdRes"
# Write Data to WD
write.csv(HeightShoe, file = paste(today, "HeightShoeInd.csv"))
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Perform a regression Shoe = b0 + b1 Height + b2 Gender_Male
lm1 <- lm(Shoe ~ Height + Gender_Male, data = HeightShoe)
# Add Residuals to Data Frame
HeightShoe$Resid <- lm1$residuals
HeightShoe$Fit <- lm1$fitted.values
MeanRes <- mean(HeightShoe$Resid)
SDRes <- sd(HeightShoe$Resid)
HeightShoe$StdRes <- (HeightShoe$Resid - MeanRes) / SDRes
# Display results
summary(lm1)
##
## Call:
## lm(formula = Shoe ~ Height + Gender_Male, data = HeightShoe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.691 -0.670 0.175 0.649 1.954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.7805 5.7761 -1.00 0.32
## Height 0.2576 0.0349 7.39 0.0000000086 ***
## Gender_Male 3.3125 0.5486 6.04 0.0000005579 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.06 on 37 degrees of freedom
## Multiple R-squared: 0.921, Adjusted R-squared: 0.917
## F-statistic: 215 on 2 and 37 DF, p-value: <0.0000000000000002
anova(lm1)
## Analysis of Variance Table
##
## Response: Shoe
## Df Sum Sq Mean Sq F value Pr(>F)
## Height 1 441 441 393.9 < 0.0000000000000002 ***
## Gender_Male 1 41 41 36.5 0.00000056 ***
## Residuals 37 41 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
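The fitted equation is now Shoe ≈ -5.78 + 0.258 × Height + 3.31 × Gender_Male, so at the same height a male is predicted to wear a shoe roughly 3.3 EU sizes larger than a female.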
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted Values
plot(HeightShoe$Fit, lm1$residuals, main = "Residuals vs Fitted", xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(lm1$residuals, main = "Normal Q-Q")
qqline(lm1$residuals, col = "red")
# Over Time
plot(HeightShoe$ID, HeightShoe$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Time (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs Leverage
plot(lm1, which = 5, main = "Residuals vs Leverage")
# Reset plotting environment
par(mfrow = c(1, 1))
# Get the ANOVA table
anova_table <- anova(lm1)
# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`
# Prepare data for the pie chart
SSdf <- data.frame(
Component = rownames(anova_table),
SumSq = SumSq
)
# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]
# Summarize the last three rows
# summary_row <- SSdf %>%
# slice_tail(n = 2) %>%
# summarise(Component = "Residual", SumSq = sum(SumSq))
# Drop the last three rows from the original data frame
# Rows <- nrow(SSdf) - 1
# SSdf <- SSdf[1:Rows, ]
# Add the summary row to the modified data frame
# SSdf <- bind_rows(SSdf, summary_row)
# Add Percentage Column (share of total sum of squares)
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
# Cumulative sum of squares (computed for reference)
CumSum <- cumsum(SSdf$SumSq)
# Format Percentage as text
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")
# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)
SSdf
## Component SumSq Percent Labels
## 1 Height 440.66 84.28 % Height\n441\n84.28 %
## 2 Residuals 41.39 7.92 % Residuals\n41\n7.92 %
## 3 Gender_Male 40.79 7.8 % Gender_Male\n41\n7.8 %
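Height and Gender_Male together account for 84.28 % + 7.8 % ≈ 92.1 % of the total sum of squares, consistent with the multiple R-squared of 0.921 reported in the regression summary above.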
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels), fill = alpha("white", 0),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x = element_blank(),  #remove x axis labels
    axis.ticks.x = element_blank(), #remove x axis ticks
    axis.text.y = element_blank(),  #remove y axis labels
    axis.ticks.y = element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on the HeightShoe Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "",
       y = "")