This chunk imports ggplot2 for data visualization and dplyr for data manipulation, both of which are essential tools in R for data analysis. Including these libraries ensures a robust set of functions to preprocess and visualize the dataset.
# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
# Loading other packages if not available
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
This chunk loads the dataset from a CSV file into the variable data. Ensure that the CSV file is in the working directory, or provide the full path to avoid errors.
# Setting working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
# Set number of decimals
options(digits = 4) # Modify global options
# Download Data from URL
Removes rows with missing values using na.omit() to avoid issues during regression modeling. Optionally, imputation methods could be used instead of removing rows if preserving data is critical. dim(data) checks the dataset’s dimensions post-cleaning.
# Remove rows with missing values to ensure a complete dataset for analysis
cereal <- na.omit(cereal)
# Check dataset dimensions after cleaning
dim(cereal)
## [1] 76 16
This includes splitting the data in training and test dataset, i.e., dftrain and dftest. Additionally, all numeric columns of the dftrain dataset are saved into dfNum for further analysis.
# Select columns for dfShort3
df <- data.frame(cereal)
names(df) <- str_to_title(names(df))
names(df) <- substr(names(df), 1, 4)
# Load Library if not available
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)
# Split data into train and test
set.seed(1234)
split <- initial_split(df, prop = 0.80)
dftrain <- training(split)
dftest <- testing(split)
# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
# Select numeric columns for correlation matrix
dfNum <- data.frame(select_if(dftrain, is.numeric))
names(dfNum) <- str_to_title(names(dfNum))
names(dfNum) <- substr(names(dfNum), 1, 4)
names(dfNum)
## [1] "Calo" "Prot" "Fat" "Sodi" "Fibe" "Carb" "Suga" "Pota" "Vita" "Shel"
## [11] "Weig" "Cups" "Rati"
# Write data to working directory
write.csv(cereal, file = "CerealClean.csv")
write.csv(dftrain, file = "CerealTrain.csv")
write.csv(dftest, file = "CerealTest.csv")
The str() function provides a summary of the dataset’s structure, such as variable names, data types, and dimensions. The head() function displays the first six rows to give a quick preview of the dataset’s content. Add narrations to interpret key observations about the variables, e.g., “This dataset contains numerical and categorical variables.” Provides summary statistics (mean, median, min, max, etc.) for all numerical variables. These statistics help understand the distribution and spread of data, which may inform preprocessing steps such as normalisation.
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
dftrain$ID <- 1:nrow(dftrain)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(dftrain)
## [1] "Name" "Mfr" "Type" "Calo" "Prot" "Fat" "Sodi" "Fibe" "Carb" "Suga"
## [11] "Pota" "Vita" "Shel" "Weig" "Cups" "Rati" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(dftrain)
## 'data.frame': 60 obs. of 17 variables:
## $ Name: chr "Fruit & Fibre Dates; Walnuts; and Oats" "Crispix" "Bran Chex" "Almond Delight" ...
## $ Mfr : chr "P" "K" "R" "R" ...
## $ Type: chr "C" "C" "C" "C" ...
## $ Calo: int 120 110 90 110 110 110 50 110 50 110 ...
## $ Prot: int 3 2 2 2 1 2 4 3 2 2 ...
## $ Fat : int 2 0 1 2 0 0 0 2 0 0 ...
## $ Sodi: int 160 220 200 200 180 280 140 140 0 290 ...
## $ Fibe: num 5 1 4 1 0 0 14 2 1 0 ...
## $ Carb: num 12 21 15 14 14 22 8 13 10 22 ...
## $ Suga: int 10 3 6 8 11 3 0 7 0 3 ...
## $ Pota: int 200 30 125 -1 35 25 330 105 50 35 ...
## $ Vita: int 25 25 25 25 25 25 25 25 0 25 ...
## $ Shel: int 3 3 1 3 1 1 3 3 3 1 ...
## $ Weig: num 1.25 1 1 1 1 1 1 1 0.5 1 ...
## $ Cups: num 0.67 1 0.67 0.75 1.33 1 0.5 0.5 1 1 ...
## $ Rati: num 40.9 46.9 49.1 34.4 28.7 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(dftrain, 5)
## Name Mfr Type Calo Prot Fat Sodi Fibe Carb
## 1 Fruit & Fibre Dates; Walnuts; and Oats P C 120 3 2 160 5 12
## 2 Crispix K C 110 2 0 220 1 21
## 3 Bran Chex R C 90 2 1 200 4 15
## 4 Almond Delight R C 110 2 2 200 1 14
## 5 Honey-comb P C 110 1 0 180 0 14
## Suga Pota Vita Shel Weig Cups Rati ID
## 1 10 200 25 3 1.25 0.67 40.92 1
## 2 3 30 25 3 1.00 1.00 46.90 2
## 3 6 125 25 1 1.00 0.67 49.12 3
## 4 8 -1 25 3 1.00 0.75 34.38 4
## 5 11 35 25 1 1.00 1.33 28.74 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(dftrain, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 Type 60
## 2 ... C 58 96.667%
## 3 ... H 2 3.333%
## 4 Calo 60 107.33 20.901 50 100 110 112.5 160 0
## 5 Prot 60 2.4667 1.0328 1 2 2 3 6 0
## 6 Fat 60 1.0667 1.0555 0 0 1 2 5 0
## 7 Sodi 60 163.83 81.741 0 135 180 220 320 0
## 8 Fibe 60 2.0917 2.373 0 0.75 1.75 3 14 0
## 9 Carb 60 14.625 3.7784 7 12 14 17 22 0
## 10 Suga 60 7.2333 4.3193 0 3 7 11 15 0
## 11 Pota 60 94.717 72.323 -1 40 90 121.25 330 0
## 12 Vita 60 29.167 22.628 0 25 25 25 100 0
## 13 Shel 60 2.2667 0.82064 1 2 2.5 3 3 0
## 14 Weig 60 1.0283 0.16307 0.5 1 1 1 1.5 0
## 15 Cups 60 0.8345 0.22424 0.33 0.67 0.815 1 1.5 0
## 16 Rati 60 41.586 13.399 19.824 32.015 39.905 49.218 93.705 0
## 17 ID 60 30.5 17.464 1 15.75 30.5 45.25 60 0
The correlation matrix of all variables hints to potential correlations between Xs and between Xs and Y (Rati … Rating)
# Plot Correlation Matrix of Numeric Values
corrMat <- cor(dfNum)
# Display results
round(corrMat, 2)
## Calo Prot Fat Sodi Fibe Carb Suga Pota Vita Shel Weig Cups
## Calo 1.00 0.10 0.52 0.26 -0.23 0.29 0.59 0.00 0.27 0.10 0.72 0.05
## Prot 0.10 1.00 0.19 0.02 0.50 0.02 -0.26 0.57 0.11 0.27 0.26 -0.29
## Fat 0.52 0.19 1.00 -0.10 0.04 -0.29 0.29 0.22 -0.07 0.31 0.21 -0.25
## Sodi 0.26 0.02 -0.10 1.00 0.02 0.39 0.01 0.02 0.32 -0.18 0.34 0.02
## Fibe -0.23 0.50 0.04 0.02 1.00 -0.35 -0.13 0.90 0.04 0.35 0.27 -0.49
## Carb 0.29 0.02 -0.29 0.39 -0.35 1.00 -0.41 -0.36 0.25 -0.13 0.19 0.39
## Suga 0.59 -0.26 0.29 0.01 -0.13 -0.41 1.00 0.05 0.11 0.01 0.48 -0.08
## Pota 0.00 0.57 0.22 0.02 0.90 -0.36 0.05 1.00 0.10 0.44 0.41 -0.49
## Vita 0.27 0.11 -0.07 0.32 0.04 0.25 0.11 0.10 1.00 0.24 0.38 0.08
## Shel 0.10 0.27 0.31 -0.18 0.35 -0.13 0.01 0.44 0.24 1.00 0.20 -0.34
## Weig 0.72 0.26 0.21 0.34 0.27 0.19 0.48 0.41 0.38 0.20 1.00 -0.21
## Cups 0.05 -0.29 -0.25 0.02 -0.49 0.39 -0.08 -0.49 0.08 -0.34 -0.21 1.00
## Rati -0.69 0.45 -0.40 -0.31 0.56 0.05 -0.77 0.34 -0.19 0.12 -0.32 -0.14
## Rati
## Calo -0.69
## Prot 0.45
## Fat -0.40
## Sodi -0.31
## Fibe 0.56
## Carb 0.05
## Suga -0.77
## Pota 0.34
## Vita -0.19
## Shel 0.12
## Weig -0.32
## Cups -0.14
## Rati 1.00
The correlation matrix of all variables displays potential correlations between Xs and between Xs and Y (Rati … Rating)
# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(dfNum),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
Fits a linear regression model with target Rati as the dependent variable and Suga and Fibe as predictors. Choice of predictors is based on domain knowledge or exploratory analysis.
# Perform a regression
lm <- lm(Rati ~ Suga + Fibe, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.37 -4.09 -1.07 2.87 16.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.052 1.732 30.1 < 2e-16 ***
## Suga -2.206 0.180 -12.2 < 2e-16 ***
## Fibe 2.626 0.328 8.0 6.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.93 on 57 degrees of freedom
## Multiple R-squared: 0.811, Adjusted R-squared: 0.804
## F-statistic: 122 on 2 and 57 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 180 < 2e-16 ***
## Fibe 1 2251 2251 64 6.9e-11 ***
## Residuals 57 2006 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load library if not avalable
if(! "Metrics" %in% installed.packages()) { install.packages("Metrics", dependencies = TRUE) }
library(Metrics)
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.782
A 3-D plot is a visual display of two predictors, i.e., sugars and Fiber, and their target variable Rating.
# Load library if not avalable
if(! "scatterplot3d" %in% installed.packages()) { install.packages("scatterplot3d", dependencies = TRUE) }
library(scatterplot3d)
# Color by Rating
rg <- colorRampPalette(c("red", "green"))(length(dftrain$Rati))
sp <- scatterplot3d(z = dftrain$Rati, y = dftrain$Suga, x = dftrain$Fibe,
color = rg,
pch = 16,
xlab = "Fiber",
ylab = "Sugars",
zlab = "Rating",
main = "3D Scatterplot of Rating vs Fiber and Sugars")
The linear model can be used to predict certain Ratings for given Sugars and Fiber. This prediction is not 100% accurate and must be shown with its prediction and confidence interval.
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Set x for prediction
cat("\n\nMake Prediction for New Sugars values 4, 5, 10 and Fiber values 5, 5, 10:\n")
##
##
## Make Prediction for New Sugars values 4, 5, 10 and Fiber values 5, 5, 10:
newRating <- data.frame(Suga = c(4, 5, 10), Fibe = c(4, 5, 10))
# Show new values table
cat("\n\nShow new values table:\n")
##
##
## Show new values table:
newRating
## Suga Fibe
## 1 4 4
## 2 5 5
## 3 10 10
# Calculate Confidence Interval
cat("\nConfidence Intervals:\n")
##
## Confidence Intervals:
predict(lm, newdata = newRating, se.fit = FALSE, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 53.73 51.52 55.95
## 2 54.15 51.65 56.65
## 3 56.25 50.62 61.89
# Calculate Prediction Interval
cat("\nPrediction Intervals:\n")
##
## Prediction Intervals:
predict(lm, newdata = newRating, se.fit = FALSE, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 53.73 41.65 65.82
## 2 54.15 42.01 66.29
## 3 56.25 43.10 69.40
High leverage points and influential observations have the power to influence the model over proportionally. Therefore, it is key to find them and confirm them not being outliers.
# Calculating Cook's Distance
cutoff <- 4/(nrow(dftrain)-length(lm$coefficients)-2)
# Plot Cook's Distance for each observation
plot(lm, which = 4, cook.levels=cutoff)
abline(h = cutoff, lty = 2, col = "red")
# Show influential observations
cooks_crit = 0.5
cooks <- cooks.distance(lm)
dftrain[which(abs(cooks) > cooks_crit),]
## [1] Name Mfr Type Calo Prot Fat Sodi Fibe Carb Suga
## [11] Pota Vita Shel Weig Cups Rati ID Resid Fit StdRes
## <0 rows> (or 0-length row.names)
Variable Shelf could not be inspected using the correlation matrix since Shelf is showing categories. Shelf 1, 2 and 3 must not be analysed as numbers but must be coded as flag variables, i.e., indicator variables. For a three category variable, two indicator variables are needed.
# Shelf 3 is base, i.e., we need to create Indicator Varibales for Shelf 1 and Shelf 2
dftrain$Shelf1 <- ifelse(dftrain$Shel == 1, 1, 0)
dftrain$Shelf2 <- ifelse(dftrain$Shel == 2, 1, 0)
# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.481 -4.093 -0.589 2.935 16.798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.964 2.080 24.50 < 2e-16 ***
## Suga -2.243 0.187 -12.02 < 2e-16 ***
## Fibe 2.825 0.371 7.62 3.7e-10 ***
## Shelf1 1.141 2.029 0.56 0.58
## Shelf2 2.531 2.101 1.20 0.23
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.96 on 55 degrees of freedom
## Multiple R-squared: 0.816, Adjusted R-squared: 0.802
## F-statistic: 60.8 on 4 and 55 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 178.35 <2e-16 ***
## Fibe 1 2251 2251 63.37 1e-10 ***
## Shelf1 1 0 0 0.01 0.93
## Shelf2 1 52 52 1.45 0.23
## Residuals 55 1954 36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.706
Potassium is being added to the regression model since it shows a strong relationship with Rating.
# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.884 -3.801 -0.667 2.836 16.067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.2550 2.2330 23.40 < 2e-16 ***
## Suga -2.1085 0.2056 -10.25 2.8e-14 ***
## Fibe 3.9095 0.8161 4.79 1.3e-05 ***
## Pota -0.0423 0.0284 -1.49 0.14
## Shelf1 0.2509 2.0945 0.12 0.91
## Shelf2 1.3339 2.2285 0.60 0.55
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.9 on 54 degrees of freedom
## Multiple R-squared: 0.823, Adjusted R-squared: 0.806
## F-statistic: 50.2 on 5 and 54 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 182.29 < 2e-16 ***
## Fibe 1 2251 2251 64.76 8.3e-11 ***
## Pota 1 115 115 3.32 0.074 .
## Shelf1 1 1 1 0.03 0.862
## Shelf2 1 12 12 0.36 0.552
## Residuals 54 1877 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.593
Multicollinearity, i.e., a strong correlation between predictors must be avoided or at least observed with caution because multicollinearity has the power to influence and damage the regression model.
# Make dftrain with 4 Predictors
dfShort4 <- data.frame(Sugar = dftrain$Suga, Fiber = dftrain$Fibe, Shelf1 = dftrain$Shelf1, Shelf2 = dftrain$Shelf2, Potass = dftrain$Pota)
# Plot Correlation Matrix of 4 Predictors
corrMat <- cor(dfShort4)
# Display results
round(corrMat, 2)
## Sugar Fiber Shelf1 Shelf2 Potass
## Sugar 1.00 -0.13 -0.14 0.26 0.05
## Fiber -0.13 1.00 -0.14 -0.38 0.90
## Shelf1 -0.14 -0.14 1.00 -0.33 -0.21
## Shelf2 0.26 -0.38 -0.33 1.00 -0.40
## Potass 0.05 0.90 -0.21 -0.40 1.00
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(dfShort4),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
Multicollinearity can also be calculated using the Variance Inflation Factor (VIF). Depending on the VIF for each factor, there could be a need to remove a factor to correct the model and avoid damage.
# Setting up the model
lm <- lm(Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.884 -3.801 -0.667 2.836 16.067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.2550 2.2330 23.40 < 2e-16 ***
## Suga -2.1085 0.2056 -10.25 2.8e-14 ***
## Fibe 3.9095 0.8161 4.79 1.3e-05 ***
## Pota -0.0423 0.0284 -1.49 0.14
## Shelf1 0.2509 2.0945 0.12 0.91
## Shelf2 1.3339 2.2285 0.60 0.55
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.9 on 54 degrees of freedom
## Multiple R-squared: 0.823, Adjusted R-squared: 0.806
## F-statistic: 50.2 on 5 and 54 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 182.29 < 2e-16 ***
## Fibe 1 2251 2251 64.76 8.3e-11 ***
## Pota 1 115 115 3.32 0.074 .
## Shelf1 1 1 1 0.03 0.862
## Shelf2 1 12 12 0.36 0.552
## Residuals 54 1877 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.593
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
##
##
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
## Suga Fibe Pota Shelf1 Shelf2
## 1.339 6.367 7.184 1.355 1.676
Since Potassium and Fiber both show a high VIF, Potassium will be removed. Potassium is harder to influence/measure.
# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.481 -4.093 -0.589 2.935 16.798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.964 2.080 24.50 < 2e-16 ***
## Suga -2.243 0.187 -12.02 < 2e-16 ***
## Fibe 2.825 0.371 7.62 3.7e-10 ***
## Shelf1 1.141 2.029 0.56 0.58
## Shelf2 2.531 2.101 1.20 0.23
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.96 on 55 degrees of freedom
## Multiple R-squared: 0.816, Adjusted R-squared: 0.802
## F-statistic: 60.8 on 4 and 55 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 178.35 <2e-16 ***
## Fibe 1 2251 2251 63.37 1e-10 ***
## Shelf1 1 0 0 0.01 0.93
## Shelf2 1 52 52 1.45 0.23
## Residuals 55 1954 36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.706
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
##
##
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
## Suga Fibe Shelf1 Shelf2
## 1.079 1.286 1.244 1.458
The resulting model shows high p-values for Shelf1 and Shelf2. Shelf1 will be removed first.
# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf2, data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf2, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.521 -3.507 -0.845 2.886 17.476
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.546 1.793 28.75 < 2e-16 ***
## Suga -2.251 0.185 -12.17 < 2e-16 ***
## Fibe 2.761 0.351 7.87 1.3e-10 ***
## Shelf2 2.060 1.916 1.08 0.29
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.92 on 56 degrees of freedom
## Multiple R-squared: 0.814, Adjusted R-squared: 0.805
## F-statistic: 82 on 3 and 56 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 180.56 < 2e-16 ***
## Fibe 1 2251 2251 64.15 7.5e-11 ***
## Shelf2 1 41 41 1.16 0.29
## Residuals 56 1965 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.723
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
##
##
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
## Suga Fibe Shelf2
## 1.073 1.166 1.227
Shelf2 shows a p-value of 0.29, i.e., it will be removed as well.
# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe , data = dftrain)
# Add Residuals to Data Frame
dftrain$Resid <- lm$residuals
dftrain$Fit <- lm$fitted.values
MeanRes <- mean(dftrain$Resid)
SDRes <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes
# Display results
summary(lm)
##
## Call:
## lm(formula = Rati ~ Suga + Fibe, data = dftrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.37 -4.09 -1.07 2.87 16.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.052 1.732 30.1 < 2e-16 ***
## Suga -2.206 0.180 -12.2 < 2e-16 ***
## Fibe 2.626 0.328 8.0 6.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.93 on 57 degrees of freedom
## Multiple R-squared: 0.811, Adjusted R-squared: 0.804
## F-statistic: 122 on 2 and 57 DF, p-value: <2e-16
anova(lm)
## Analysis of Variance Table
##
## Response: Rati
## Df Sum Sq Mean Sq F value Pr(>F)
## Suga 1 6336 6336 180 < 2e-16 ***
## Fibe 1 2251 2251 64 6.9e-11 ***
## Residuals 57 2006 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
##
##
## RMSE: 5.782
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
##
##
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
## Suga Fibe
## 1.018 1.018
The final model shows all p-values to be significant and an R-squared adjusted of 80.8%. Now residual assumptions are to be checked.
# Display Normal Probability Plot of Residuals incl. 95% Confidence Interval
qqPlot(lm$residuals, col.lines = "pink", col = "darkblue", xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
## [1] 19 56
# Test normality of residuals
cat("\n\nNormal distribution indicator of Residuals")
##
##
## Normal distribution indicator of Residuals
p.value <- shapiro.test(lm$residuals)$p.value
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
##
## p-value: 0.04378
cat("\nIf p-value > 0.05, data are normally distributed.")
##
## If p-value > 0.05, data are normally distributed.
if (p.value > 0.05) {
cat("\n\nThis means, residuals are normally distributed.")
} else {
cat("\n\nThis means, residuals are not normally distributed.")
}
##
##
## This means, residuals are not normally distributed.
Produces diagnostic plots (residual vs. fitted, Q-Q plot, scale-location, residuals vs. leverage). The Q-Q plot checks the normality of residuals, and deviations indicate non-normality. All other plots must not show any pattern since patterns in these plots hint to a lurking variable that must be found and included.
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted
plot(lm$fitted.values, lm$residuals, main = "Residuals vs Fitted", xlab = "Fitted values", ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(lm$residuals, main = "Normal Q-Q")
qqline(lm$residuals, col = "red")
# Residuals vs Time (ID)
plot(dftrain$ID, dftrain$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Observation Order (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs Leverage
plot(lm, which = 5, main = "Cook's Distance vs Leverage")
# Reset plotting environment
par(mfrow = c(1, 1))
Plots of residuals against Xs are to be checked for patterns. No patterns is good news.
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs X
plot(dftrain$Fibe, lm$residuals, main = "Residuals vs Fiber", xlab = "Fiber", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs X
plot(dftrain$Pota, lm$residuals, main = "Residuals vs Potassium", xlab = "Potassium", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs X
plot(dftrain$Shelf1, lm$residuals, main = "Residuals vs Shelf", xlab = "Shelf", ylab = "Residuals")
abline(h = 0, col = "red")
# Residuals vs X
plot(dftrain$Suga, lm$residuals, main = "Residuals vs Sugar", xlab = "Sugar", ylab = "Residuals")
abline(h = 0, col = "red")
# Reset plotting environment
par(mfrow = c(1, 1))
# Load library if not avalable
if(! "Metrics" %in% installed.packages()) { install.packages("Metrics", dependencies = TRUE) }
library(Metrics)
# Make predictions
predictTrain <- predict(lm, dftrain)
predictTest <- predict(lm, dftest)
# Model performance
# Compute the prediction error, RMSE for dftrain
RMSE <- rmse(predictTrain, dftrain$Rati)
cat("\n\nThe average difference between the actual values for Rating and the predicted ones using the training data is ", round(RMSE, 2) , ".\n")
##
##
## The average difference between the actual values for Rating and the predicted ones using the training data is 5.78 .
# Compute the prediction error, RMSE for dftest
RMSE <- rmse(predictTest, dftest$Rati)
cat("\nThe average difference between the actual values for Rating and the predicted ones using the test data is ", round(RMSE, 2) , ".\n")
##
## The average difference between the actual values for Rating and the predicted ones using the test data is 6.96 .
# Please also read: http://www.sthda.com/english/articles/40-regression-analysis/165-linear-regression-essentials-in-r/
Preparing data for visualizing the sum of squares from the ANOVA table. Includes calculating percentages and cumulative sums for better presentation.
# Get the ANOVA table
anova_table <- anova(lm)
# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`
# Prepare data for the pie chart
SSdf <- data.frame(
Component = rownames(anova_table),
SumSq = SumSq
)
# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]
# Summarize the last three rows
# summary_row <- SSdf %>%
# slice_tail(n = 2) %>%
# summarise(Component = "Residual", SumSq = sum(SumSq))
# Drop the last three rows from the original data frame
# Rows <- nrow(SSdf) - 1
# SSdf <- SSdf[1:Rows, ]
# Add the summary row to the modified data frame
# SSdf <- bind_rows(SSdf, summary_row)
# Add Cum Sum Column
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)
# Add Percentage Column
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")
# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)
SSdf
## Component SumSq Percent Labels
## 1 Suga 6336 59.81 % Suga\n6336\n59.81 %
## 2 Fibe 2251 21.25 % Fibe\n2251\n21.25 %
## 3 Residuals 2006 18.93 % Residuals\n2006\n18.93 %
Creating a pie chart to illustrate the contribution of each regression component to the total sum of squares. Enhances understanding of model variance attribution.
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
geom_bar(stat = "identity", width = 1) +
geom_label(aes(label = Labels, fill_alpha("white", 0)),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
plot.x = element_text(size = 0),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")