1 Loading Libraries

This chunk installs (if necessary) and loads rmarkdown for report rendering, readxl for data import, car for regression diagnostics, dplyr for data manipulation, and stringr for string handling — all of which are used later in this analysis. Including these libraries ensures a robust set of functions to preprocess and analyse the dataset.

# Loading necessary packages
## Markdown Update
# Check against the installed package NAMES (rownames), not the whole matrix:
# matching "pkg" against every cell of installed.packages() can give false
# positives when the name appears inside another package's Depends/Imports field.
if(! "rmarkdown" %in% rownames(installed.packages())) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages if not available
if(! "readxl" %in% rownames(installed.packages())) { install.packages("readxl", dependencies = TRUE) }
library(readxl)

if(! "car" %in% rownames(installed.packages())) { install.packages("car", dependencies = TRUE) }
library(car)

if(! "dplyr" %in% rownames(installed.packages())) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)

if(! "stringr" %in% rownames(installed.packages())) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

2 Getting Data

This chunk sets the working directory and display options. The analysis expects the dataset in the variable cereal; ensure the CSV file is in the working directory, or provide the full path, to avoid errors.

# Setting working Directory
# NOTE(review): setwd() with an absolute, user-specific path is not portable;
# consider relative paths or the here package when sharing this document.
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Set number of decimals
options(digits = 4)              # Modify global options

# Download Data from URL
# NOTE(review): no download/read code is present in this chunk; the 'cereal'
# data frame used below must be loaded elsewhere — confirm the data source.

3 Cleaning Data

Removes rows with missing values using na.omit() to avoid issues during regression modeling. Optionally, imputation methods could be used instead of removing rows if preserving data is critical. dim(data) checks the dataset’s dimensions post-cleaning.

# Remove rows with missing values to ensure a complete dataset for analysis
# (na.omit drops every row containing at least one NA; 'cereal' must already exist)
cereal <- na.omit(cereal)

# Check dataset dimensions after cleaning: (rows, columns)
dim(cereal)
## [1] 76 16

4 Preparing Data

This includes splitting the data in training and test dataset, i.e., dftrain and dftest. Additionally, all numeric columns of the dftrain dataset are saved into dfNum for further analysis.

# Copy cereal into a working data frame and shorten its column names:
# title-case them, then keep only the first 4 characters (e.g. "rating" -> "Rati")
df        <- data.frame(cereal)
names(df) <- str_to_title(names(df))
names(df) <- substr(names(df), 1, 4)

# Load Library if not available (rsample provides initial_split/training/testing)
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)

# Split data into train (80%) and test (20%); set.seed makes the split reproducible
set.seed(1234)
split   <- initial_split(df, prop = 0.80)
dftrain <- training(split)
dftest  <- testing(split)

# Select numeric columns for the correlation matrix.
# (stringr is already installed/loaded in the first chunk, so the duplicate
# install-check and library(stringr) call that used to sit here were removed.)
dfNum  <- data.frame(select_if(dftrain, is.numeric))
names(dfNum) <- str_to_title(names(dfNum))
names(dfNum) <- substr(names(dfNum), 1, 4)
names(dfNum)
##  [1] "Calo" "Prot" "Fat"  "Sodi" "Fibe" "Carb" "Suga" "Pota" "Vita" "Shel"
## [11] "Weig" "Cups" "Rati"
# Write cleaned and split data to the working directory
# NOTE(review): write.csv adds a row-name column by default; pass
# row.names = FALSE if the extra first column is unwanted downstream.
write.csv(cereal,  file = "CerealClean.csv")
write.csv(dftrain, file = "CerealTrain.csv")
write.csv(dftest,  file = "CerealTest.csv")

5 Exploring Data

The str() function provides a summary of the dataset’s structure, such as variable names, data types, and dimensions. The head() function displays the first six rows to give a quick preview of the dataset’s content. Add narrations to interpret key observations about the variables, e.g., “This dataset contains numerical and categorical variables.” Provides summary statistics (mean, median, min, max, etc.) for all numerical variables. These statistics help understand the distribution and spread of data, which may inform preprocessing steps such as normalisation.

# Loading other packages if not available (vtable provides the st() summary table)
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Add a sequential ID to the training data (used later to plot residuals in order)
# NOTE(review): seq_len(nrow(dftrain)) would be safer if the frame could be empty
dftrain$ID <- 1:nrow(dftrain)

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(dftrain)
##  [1] "Name" "Mfr"  "Type" "Calo" "Prot" "Fat"  "Sodi" "Fibe" "Carb" "Suga"
## [11] "Pota" "Vita" "Shel" "Weig" "Cups" "Rati" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(dftrain)
## 'data.frame':    60 obs. of  17 variables:
##  $ Name: chr  "Fruit & Fibre Dates; Walnuts; and Oats" "Crispix" "Bran Chex" "Almond Delight" ...
##  $ Mfr : chr  "P" "K" "R" "R" ...
##  $ Type: chr  "C" "C" "C" "C" ...
##  $ Calo: int  120 110 90 110 110 110 50 110 50 110 ...
##  $ Prot: int  3 2 2 2 1 2 4 3 2 2 ...
##  $ Fat : int  2 0 1 2 0 0 0 2 0 0 ...
##  $ Sodi: int  160 220 200 200 180 280 140 140 0 290 ...
##  $ Fibe: num  5 1 4 1 0 0 14 2 1 0 ...
##  $ Carb: num  12 21 15 14 14 22 8 13 10 22 ...
##  $ Suga: int  10 3 6 8 11 3 0 7 0 3 ...
##  $ Pota: int  200 30 125 -1 35 25 330 105 50 35 ...
##  $ Vita: int  25 25 25 25 25 25 25 25 0 25 ...
##  $ Shel: int  3 3 1 3 1 1 3 3 3 1 ...
##  $ Weig: num  1.25 1 1 1 1 1 1 1 0.5 1 ...
##  $ Cups: num  0.67 1 0.67 0.75 1.33 1 0.5 0.5 1 1 ...
##  $ Rati: num  40.9 46.9 49.1 34.4 28.7 ...
##  $ ID  : int  1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(dftrain, 5)
##                                     Name Mfr Type Calo Prot Fat Sodi Fibe Carb
## 1 Fruit & Fibre Dates; Walnuts; and Oats   P    C  120    3   2  160    5   12
## 2                                Crispix   K    C  110    2   0  220    1   21
## 3                              Bran Chex   R    C   90    2   1  200    4   15
## 4                         Almond Delight   R    C  110    2   2  200    1   14
## 5                             Honey-comb   P    C  110    1   0  180    0   14
##   Suga Pota Vita Shel Weig Cups  Rati ID
## 1   10  200   25    3 1.25 0.67 40.92  1
## 2    3   30   25    3 1.00 1.00 46.90  2
## 3    6  125   25    1 1.00 0.67 49.12  3
## 4    8   -1   25    3 1.00 0.75 34.38  4
## 5   11   35   25    1 1.00 1.33 28.74  5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(dftrain, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##    Variable  N    Mean      SD    Min    P25    P50    P75    Max NA Mode
## 1      Type 60                                                           
## 2     ... C 58 96.667%                                                   
## 3     ... H  2  3.333%                                                   
## 4      Calo 60  107.33  20.901     50    100    110  112.5    160  0     
## 5      Prot 60  2.4667  1.0328      1      2      2      3      6  0     
## 6       Fat 60  1.0667  1.0555      0      0      1      2      5  0     
## 7      Sodi 60  163.83  81.741      0    135    180    220    320  0     
## 8      Fibe 60  2.0917   2.373      0   0.75   1.75      3     14  0     
## 9      Carb 60  14.625  3.7784      7     12     14     17     22  0     
## 10     Suga 60  7.2333  4.3193      0      3      7     11     15  0     
## 11     Pota 60  94.717  72.323     -1     40     90 121.25    330  0     
## 12     Vita 60  29.167  22.628      0     25     25     25    100  0     
## 13     Shel 60  2.2667 0.82064      1      2    2.5      3      3  0     
## 14     Weig 60  1.0283 0.16307    0.5      1      1      1    1.5  0     
## 15     Cups 60  0.8345 0.22424   0.33   0.67  0.815      1    1.5  0     
## 16     Rati 60  41.586  13.399 19.824 32.015 39.905 49.218 93.705  0     
## 17       ID 60    30.5  17.464      1  15.75   30.5  45.25     60  0

6 Computing Correlation Matrix

The correlation matrix of all variables hints to potential correlations between Xs and between Xs and Y (Rati … Rating)

# Compute Pearson correlations across all numeric training columns
corrMat <- cor(dfNum)

# Show the correlations rounded to two decimals for readability
corrDisplay <- round(corrMat, 2)
corrDisplay
##       Calo  Prot   Fat  Sodi  Fibe  Carb  Suga  Pota  Vita  Shel  Weig  Cups
## Calo  1.00  0.10  0.52  0.26 -0.23  0.29  0.59  0.00  0.27  0.10  0.72  0.05
## Prot  0.10  1.00  0.19  0.02  0.50  0.02 -0.26  0.57  0.11  0.27  0.26 -0.29
## Fat   0.52  0.19  1.00 -0.10  0.04 -0.29  0.29  0.22 -0.07  0.31  0.21 -0.25
## Sodi  0.26  0.02 -0.10  1.00  0.02  0.39  0.01  0.02  0.32 -0.18  0.34  0.02
## Fibe -0.23  0.50  0.04  0.02  1.00 -0.35 -0.13  0.90  0.04  0.35  0.27 -0.49
## Carb  0.29  0.02 -0.29  0.39 -0.35  1.00 -0.41 -0.36  0.25 -0.13  0.19  0.39
## Suga  0.59 -0.26  0.29  0.01 -0.13 -0.41  1.00  0.05  0.11  0.01  0.48 -0.08
## Pota  0.00  0.57  0.22  0.02  0.90 -0.36  0.05  1.00  0.10  0.44  0.41 -0.49
## Vita  0.27  0.11 -0.07  0.32  0.04  0.25  0.11  0.10  1.00  0.24  0.38  0.08
## Shel  0.10  0.27  0.31 -0.18  0.35 -0.13  0.01  0.44  0.24  1.00  0.20 -0.34
## Weig  0.72  0.26  0.21  0.34  0.27  0.19  0.48  0.41  0.38  0.20  1.00 -0.21
## Cups  0.05 -0.29 -0.25  0.02 -0.49  0.39 -0.08 -0.49  0.08 -0.34 -0.21  1.00
## Rati -0.69  0.45 -0.40 -0.31  0.56  0.05 -0.77  0.34 -0.19  0.12 -0.32 -0.14
##       Rati
## Calo -0.69
## Prot  0.45
## Fat  -0.40
## Sodi -0.31
## Fibe  0.56
## Carb  0.05
## Suga -0.77
## Pota  0.34
## Vita -0.19
## Shel  0.12
## Weig -0.32
## Cups -0.14
## Rati  1.00

7 Showing Correlation Matrix

The correlation matrix of all variables displays potential correlations between Xs and between Xs and Y (Rati … Rating)

# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)

# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(dfNum),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

8 Run Regression

Fits a linear regression model with target Rati as the dependent variable and Suga and Fibe as predictors. Choice of predictors is based on domain knowledge or exploratory analysis.

# Perform a regression of Rating on Sugars and Fiber
# NOTE(review): the result is named 'lm', which shadows stats::lm for the rest
# of the script; it works, but a name like 'fit' would be clearer.
lm <- lm(Rati ~ Suga + Fibe, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
# Standardised residuals: centred on the mean residual, scaled by their SD
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe, data = dftrain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12.37  -4.09  -1.07   2.87  16.30 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.052      1.732    30.1  < 2e-16 ***
## Suga          -2.206      0.180   -12.2  < 2e-16 ***
## Fibe           2.626      0.328     8.0  6.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.93 on 57 degrees of freedom
## Multiple R-squared:  0.811,  Adjusted R-squared:  0.804 
## F-statistic:  122 on 2 and 57 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Suga       1   6336    6336     180 < 2e-16 ***
## Fibe       1   2251    2251      64 6.9e-11 ***
## Residuals 57   2006      35                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load library if not available (Metrics provides rmse())
if(! "Metrics" %in% installed.packages()) { install.packages("Metrics", dependencies = TRUE) }
library(Metrics)

# Calculate RMSE of the fitted values against the observed Ratings (training data)
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.782

9 Displaying 3-D Plot

A 3-D plot is a visual display of two predictors, i.e., sugars and Fiber, and their target variable Rating.

# Load library if not available
if(! "scatterplot3d" %in% installed.packages()) { install.packages("scatterplot3d", dependencies = TRUE) }
library(scatterplot3d)

# Colour by Rating: build a red-to-green palette and index it by each point's
# Rating rank, so colour actually encodes Rating. (The original assigned the
# palette in row order, which coloured points by their position in the data
# frame rather than by their Rating value.)
rg <- colorRampPalette(c("red", "green"))(length(dftrain$Rati))
sp <- scatterplot3d(z = dftrain$Rati, y = dftrain$Suga, x = dftrain$Fibe,
color = rg[rank(dftrain$Rati, ties.method = "first")],
pch  = 16,
xlab = "Fiber",
ylab = "Sugars",
zlab = "Rating",
main = "3D Scatterplot of Rating vs Fiber and Sugars")

10 Calculating Prediction with Confidence and Prediction Interval

The linear model can be used to predict certain Ratings for given Sugars and Fiber. This prediction is not 100% accurate and must be shown with its prediction and confidence interval.

if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Set x for prediction
# (message corrected: the Fibe values used below are 4, 5, 10 — the original
# text claimed 5, 5, 10, contradicting the data frame and the printed table)
cat("\n\nMake Prediction for New Sugars values 4, 5, 10 and Fiber values 4, 5, 10:\n")
## 
## 
## Make Prediction for New Sugars values 4, 5, 10 and Fiber values 4, 5, 10:
newRating <- data.frame(Suga = c(4, 5, 10), Fibe = c(4, 5, 10))

# Show new values table
cat("\n\nShow new values table:\n")
## 
## 
## Show new values table:
newRating
##   Suga Fibe
## 1    4    4
## 2    5    5
## 3   10   10
# Calculate Confidence Interval
cat("\nConfidence Intervals:\n")
## 
## Confidence Intervals:
predict(lm, newdata = newRating, se.fit = FALSE, interval = "confidence", level = 0.95)
##     fit   lwr   upr
## 1 53.73 51.52 55.95
## 2 54.15 51.65 56.65
## 3 56.25 50.62 61.89
# Calculate Prediction Interval
cat("\nPrediction Intervals:\n")
## 
## Prediction Intervals:
predict(lm, newdata = newRating, se.fit = FALSE, interval = "prediction", level = 0.95)
##     fit   lwr   upr
## 1 53.73 41.65 65.82
## 2 54.15 42.01 66.29
## 3 56.25 43.10 69.40

11 Showing High Leverage Points and Influential Observations

High leverage points and influential observations have the power to influence the model over proportionally. Therefore, it is key to find them and confirm them not being outliers.

# Cook's distance cutoff (common rule of thumb): 4 / (n - k - 2)
cutoff <- 4 / (nrow(dftrain) - length(lm$coefficients) - 2)

# Plot Cook's Distance for each observation, with the cutoff marked
plot(lm, which = 4, cook.levels = cutoff)
abline(h = cutoff, lty = 2, col = "red")

# List observations whose Cook's distance exceeds the critical value
# (Cook's distances are nonnegative, so no abs() is needed)
cooks_crit <- 0.5
cooks <- cooks.distance(lm)
dftrain[which(cooks > cooks_crit), ]
##  [1] Name   Mfr    Type   Calo   Prot   Fat    Sodi   Fibe   Carb   Suga  
## [11] Pota   Vita   Shel   Weig   Cups   Rati   ID     Resid  Fit    StdRes
## <0 rows> (or 0-length row.names)

12 Making Indicator Variables for Shelf and Running Regression

Variable Shelf could not be inspected using the correlation matrix since Shelf is showing categories. Shelf 1, 2 and 3 must not be analysed as numbers but must be coded as flag variables, i.e., indicator variables. For a three category variable, two indicator variables are needed.

# Shelf 3 is the base level, i.e., we need indicator variables for Shelf 1 and Shelf 2
dftrain$Shelf1 <- as.numeric(dftrain$Shel == 1)
dftrain$Shelf2 <- as.numeric(dftrain$Shel == 2)

# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.481  -4.093  -0.589   2.935  16.798 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   50.964      2.080   24.50  < 2e-16 ***
## Suga          -2.243      0.187  -12.02  < 2e-16 ***
## Fibe           2.825      0.371    7.62  3.7e-10 ***
## Shelf1         1.141      2.029    0.56     0.58    
## Shelf2         2.531      2.101    1.20     0.23    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.96 on 55 degrees of freedom
## Multiple R-squared:  0.816,  Adjusted R-squared:  0.802 
## F-statistic: 60.8 on 4 and 55 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value Pr(>F)    
## Suga       1   6336    6336  178.35 <2e-16 ***
## Fibe       1   2251    2251   63.37  1e-10 ***
## Shelf1     1      0       0    0.01   0.93    
## Shelf2     1     52      52    1.45   0.23    
## Residuals 55   1954      36                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.706

13 Adding Potassium and Running Regression

Potassium is being added to the regression model since it shows a strong relationship with Rating.

# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.884  -3.801  -0.667   2.836  16.067 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.2550     2.2330   23.40  < 2e-16 ***
## Suga         -2.1085     0.2056  -10.25  2.8e-14 ***
## Fibe          3.9095     0.8161    4.79  1.3e-05 ***
## Pota         -0.0423     0.0284   -1.49     0.14    
## Shelf1        0.2509     2.0945    0.12     0.91    
## Shelf2        1.3339     2.2285    0.60     0.55    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.9 on 54 degrees of freedom
## Multiple R-squared:  0.823,  Adjusted R-squared:  0.806 
## F-statistic: 50.2 on 5 and 54 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Suga       1   6336    6336  182.29 < 2e-16 ***
## Fibe       1   2251    2251   64.76 8.3e-11 ***
## Pota       1    115     115    3.32   0.074 .  
## Shelf1     1      1       1    0.03   0.862    
## Shelf2     1     12      12    0.36   0.552    
## Residuals 54   1877      35                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.593

14 Exploring Correlation Between Predictors

Multicollinearity, i.e., a strong correlation between predictors must be avoided or at least observed with caution because multicollinearity has the power to influence and damage the regression model.

# Make dftrain with 4 Predictors
dfShort4 <- data.frame(Sugar = dftrain$Suga, Fiber = dftrain$Fibe, Shelf1 = dftrain$Shelf1, Shelf2 = dftrain$Shelf2, Potass = dftrain$Pota)

# Plot Correlation Matrix of 4 Predictors
corrMat <- cor(dfShort4)

# Display results 
round(corrMat, 2)
##        Sugar Fiber Shelf1 Shelf2 Potass
## Sugar   1.00 -0.13  -0.14   0.26   0.05
## Fiber  -0.13  1.00  -0.14  -0.38   0.90
## Shelf1 -0.14 -0.14   1.00  -0.33  -0.21
## Shelf2  0.26 -0.38  -0.33   1.00  -0.40
## Potass  0.05  0.90  -0.21  -0.40   1.00
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(dfShort4),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

15 Exploring Multicollinearity

Multicollinearity can also be calculated using the Variance Inflation Factor (VIF). Depending on the VIF for each factor, there could be a need to remove a factor to correct the model and avoid damage.

# Setting up the model
lm <- lm(Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe + Pota + Shelf1 + Shelf2, data = dftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.884  -3.801  -0.667   2.836  16.067 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.2550     2.2330   23.40  < 2e-16 ***
## Suga         -2.1085     0.2056  -10.25  2.8e-14 ***
## Fibe          3.9095     0.8161    4.79  1.3e-05 ***
## Pota         -0.0423     0.0284   -1.49     0.14    
## Shelf1        0.2509     2.0945    0.12     0.91    
## Shelf2        1.3339     2.2285    0.60     0.55    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.9 on 54 degrees of freedom
## Multiple R-squared:  0.823,  Adjusted R-squared:  0.806 
## F-statistic: 50.2 on 5 and 54 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Suga       1   6336    6336  182.29 < 2e-16 ***
## Fibe       1   2251    2251   64.76 8.3e-11 ***
## Pota       1    115     115    3.32   0.074 .  
## Shelf1     1      1       1    0.03   0.862    
## Shelf2     1     12      12    0.36   0.552    
## Residuals 54   1877      35                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.593
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
## 
## 
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
##   Suga   Fibe   Pota Shelf1 Shelf2 
##  1.339  6.367  7.184  1.355  1.676

16 Remove Pota and Run Regression

Since Potassium and Fiber both show a high VIF, Potassium will be removed. Potassium is harder to influence/measure.

# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf1 + Shelf2, data = dftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.481  -4.093  -0.589   2.935  16.798 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   50.964      2.080   24.50  < 2e-16 ***
## Suga          -2.243      0.187  -12.02  < 2e-16 ***
## Fibe           2.825      0.371    7.62  3.7e-10 ***
## Shelf1         1.141      2.029    0.56     0.58    
## Shelf2         2.531      2.101    1.20     0.23    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.96 on 55 degrees of freedom
## Multiple R-squared:  0.816,  Adjusted R-squared:  0.802 
## F-statistic: 60.8 on 4 and 55 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value Pr(>F)    
## Suga       1   6336    6336  178.35 <2e-16 ***
## Fibe       1   2251    2251   63.37  1e-10 ***
## Shelf1     1      0       0    0.01   0.93    
## Shelf2     1     52      52    1.45   0.23    
## Residuals 55   1954      36                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.706
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
## 
## 
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
##   Suga   Fibe Shelf1 Shelf2 
##  1.079  1.286  1.244  1.458

17 Removing Shelf1 and Running Regression

The resulting model shows high p-values for Shelf1 and Shelf2. Shelf1 will be removed first.

# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe + Shelf2, data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe + Shelf2, data = dftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.521  -3.507  -0.845   2.886  17.476 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   51.546      1.793   28.75  < 2e-16 ***
## Suga          -2.251      0.185  -12.17  < 2e-16 ***
## Fibe           2.761      0.351    7.87  1.3e-10 ***
## Shelf2         2.060      1.916    1.08     0.29    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.92 on 56 degrees of freedom
## Multiple R-squared:  0.814,  Adjusted R-squared:  0.805 
## F-statistic:   82 on 3 and 56 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Suga       1   6336    6336  180.56 < 2e-16 ***
## Fibe       1   2251    2251   64.15 7.5e-11 ***
## Shelf2     1     41      41    1.16    0.29    
## Residuals 56   1965      35                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.723
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
## 
## 
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
##   Suga   Fibe Shelf2 
##  1.073  1.166  1.227

18 Removing Shelf2 and Running Regression

Shelf2 shows a p-value of 0.29, i.e., it will be removed as well.

# Run regression with Shelf variable
lm <- lm(Rati ~ Suga + Fibe , data = dftrain)

# Add Residuals to Data Frame
dftrain$Resid  <- lm$residuals
dftrain$Fit    <- lm$fitted.values
MeanRes        <- mean(dftrain$Resid)
SDRes          <- sd(dftrain$Resid)
dftrain$StdRes <- (dftrain$Resid - MeanRes) / SDRes

# Display results 
summary(lm) 
## 
## Call:
## lm(formula = Rati ~ Suga + Fibe, data = dftrain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12.37  -4.09  -1.07   2.87  16.30 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.052      1.732    30.1  < 2e-16 ***
## Suga          -2.206      0.180   -12.2  < 2e-16 ***
## Fibe           2.626      0.328     8.0  6.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.93 on 57 degrees of freedom
## Multiple R-squared:  0.811,  Adjusted R-squared:  0.804 
## F-statistic:  122 on 2 and 57 DF,  p-value: <2e-16
anova(lm)
## Analysis of Variance Table
## 
## Response: Rati
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Suga       1   6336    6336     180 < 2e-16 ***
## Fibe       1   2251    2251      64 6.9e-11 ***
## Residuals 57   2006      35                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate RMSE
RMSE <- rmse(dftrain$Rati, dftrain$Fit)
cat("\n\nRMSE: ", RMSE)
## 
## 
## RMSE:  5.782
# Calculating VIF
vif_values <- vif(lm)
cat("\n\nVIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.\nThe VIF values are:\n")
## 
## 
## VIF must be lower than 10 and should be lower than 5 for a model that is not damaged by multicollinearity.
## The VIF values are:
vif_values
##  Suga  Fibe 
## 1.018 1.018

19 Checking Whether Residuals Are Normal

The final model shows all p-values to be significant and an adjusted R-squared of 80.4%. Now residual assumptions are to be checked.

# Display Normal Probability Plot of Residuals incl. 95% Confidence Interval
# (car::qqPlot also prints the indices of the most extreme residuals)
qqPlot(lm$residuals, col.lines = "pink", col = "darkblue", xlab = "Theoretical Quantiles", ylab = "Sample Quantiles") 

## [1] 19 56
# Test normality of residuals with the Shapiro-Wilk test
cat("\n\nNormal distribution indicator of Residuals")
## 
## 
## Normal distribution indicator of Residuals
p.value <- shapiro.test(lm$residuals)$p.value
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
## 
## p-value:  0.04378
cat("\nIf p-value > 0.05, data are normally distributed.")
## 
## If p-value > 0.05, data are normally distributed.
# Interpret the test result at the 5% significance level
if (p.value > 0.05) { 
        cat("\n\nThis means, residuals are normally distributed.")
} else {
        cat("\n\nThis means, residuals are not normally distributed.")
}
## 
## 
## This means, residuals are not normally distributed.

20 Plotting All Residual Plots

Produces diagnostic plots (residual vs. fitted, Q-Q plot, scale-location, residuals vs. leverage). The Q-Q plot checks the normality of residuals, and deviations indicate non-normality. All other plots must not show any pattern since patterns in these plots hint to a lurking variable that must be found and included.

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted
plot(lm$fitted.values, lm$residuals, main = "Residuals vs Fitted", xlab = "Fitted values",      ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(lm$residuals, main = "Normal Q-Q")
qqline(lm$residuals, col = "red")

# Residuals vs Time (ID)
plot(dftrain$ID, dftrain$StdRes,      main = "Residuals over Time (Sequence of data)",      xlab = "Observation Order (ID)",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Leverage
plot(lm, which = 5, main = "Cook's Distance vs Leverage")

# Reset plotting environment
par(mfrow = c(1, 1))

21 Plotting All Residual Plots against Xs

Plots of residuals against the predictors (Xs) are checked for patterns; the absence of any pattern is good news.

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fiber (a predictor in the final model)
plot(dftrain$Fibe, lm$residuals, main = "Residuals vs Fiber", xlab = "Fiber",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Potassium (not in the final model; checks for a lurking variable)
plot(dftrain$Pota, lm$residuals, main = "Residuals vs Potassium", xlab = "Potassium",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Shelf
# NOTE(review): the x variable is the Shelf1 dummy (0/1), not the original
# Shelf category, although the plot is labelled "Shelf" — confirm intent.
plot(dftrain$Shelf1, lm$residuals, main = "Residuals vs Shelf", xlab = "Shelf",      ylab = "Residuals")
abline(h = 0, col = "red")

# Residuals vs Sugar (a predictor in the final model)
plot(dftrain$Suga, lm$residuals, main = "Residuals vs Sugar", xlab = "Sugar",      ylab = "Residuals")
abline(h = 0, col = "red")

# Reset plotting environment
par(mfrow = c(1, 1))

22 Testing Model with dftest Data

# Load library if not avalable
if(! "Metrics" %in% installed.packages()) { install.packages("Metrics", dependencies = TRUE) }
library(Metrics)

# Make predictions
predictTrain <- predict(lm, dftrain)
predictTest  <- predict(lm, dftest)

# Model performance
# Compute the prediction error, RMSE for dftrain
RMSE <- rmse(predictTrain, dftrain$Rati)
cat("\n\nThe average difference between the actual values for Rating and the predicted ones using the training data is ", round(RMSE, 2) , ".\n")
## 
## 
## The average difference between the actual values for Rating and the predicted ones using the training data is  5.78 .
# Compute the prediction error, RMSE for dftest
RMSE <- rmse(predictTest, dftest$Rati)
cat("\nThe average difference between the actual values for Rating and the predicted ones using the test data is ", round(RMSE, 2) , ".\n")
## 
## The average difference between the actual values for Rating and the predicted ones using the test data is  6.96 .
# Please also read: http://www.sthda.com/english/articles/40-regression-analysis/165-linear-regression-essentials-in-r/

23 Preparing Data for Pie Chart to Show Relative Importance of Factors

Preparing data for visualizing the sum of squares from the ANOVA table. Includes calculating percentages and cumulative sums for better presentation.

# Get the ANOVA table of the final model
anova_table <- anova(lm)

# Extract the sum of squares for each model component
SumSq <- anova_table$`Sum Sq`

# Prepare data for the pie chart
SSdf <- data.frame(
  Component = rownames(anova_table),
  SumSq = SumSq
)

# Sort by SumSq, largest contribution first
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]

# Add Percentage Column (share of the total sum of squares, formatted as text)
# NOTE(review): the original also computed cumsum(SSdf$SumSq) into an unused
# variable and carried a block of commented-out row-summarising code; both
# were dead code and have been removed.
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")

# Add Label Column used inside the pie slices: "Component\nSumSq\nPercent"
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)

SSdf
##   Component SumSq Percent                   Labels
## 1      Suga  6336 59.81 %      Suga\n6336\n59.81 %
## 2      Fibe  2251 21.25 %      Fibe\n2251\n21.25 %
## 3 Residuals  2006 18.93 % Residuals\n2006\n18.93 %

24 Showing Pie Chart of Sum of Squares for Regression Model

Creating a pie chart to illustrate the contribution of each regression component to the total sum of squares. Enhances understanding of model variance attribution.

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours (pastel palette; only the first nrow(SSdf) entries are used)
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the pie chart: a stacked bar in polar coordinates
# NOTE(review): 'Total' and '.desc = TRUE' inside aes() are not valid aesthetics
# and appear to be leftovers; likewise fill_alpha("white", 0) is passed to
# geom_label's aes() as an unnamed aesthetic — confirm intent, consider removing.
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels, fill_alpha("white", 0)),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  # rotate so the first slice starts at 60 degrees, drawn clockwise
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    plot.x = element_text(size = 0),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Cereal Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")