1 Model Evaluation Techniques for Multiple Regression

1.1 Load Libraries

Install and load the required packages for model evaluation and visualisation. This ensures all necessary libraries are available before running the analysis; the packages include tools for data processing, visualisation, and regression diagnostics.

# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages if not available
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "pROC" %in% installed.packages()) { install.packages("pROC", dependencies = TRUE) }
library(pROC)
if(! "caret" %in% installed.packages()) { install.packages("caret", dependencies = TRUE) }
library(caret)
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)

# Global Settings
options(digits =   4)
options(scipen = 999)
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

1.2 Get Data for Cereals

Load the cereals dataset (e.g., from an Excel file or a URL) and confirm that it is in the correct format and contains the necessary variables: it must include both the predictors and the response variable for the regression analysis.

# Download Training Data from URL
# Download Test Data from URL
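
The actual download code is not shown in this chunk. Below is a minimal sketch of one way the Cereals data could be loaded, assuming a local Excel file called Cereals.xlsx (the file name and sheet are assumptions, not the original source):

# Minimal sketch: read the Cereals data from an assumed Excel file
Cereals <- as.data.frame(read_excel("Cereals.xlsx", sheet = 1))

# Convert character columns to factors to match the structure shown in 1.3
Cereals[] <- lapply(Cereals, function(x) if (is.character(x)) factor(x) else x)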

1.3 Explore Data for Cereals

Characteristics of the dataset are shown together with descriptive statistics.

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Add ID to Data Frame
Cereals$ID <- 1:nrow(Cereals)

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Cereals)
##  [1] "name"     "mfr"      "type"     "calories" "protein"  "fat"     
##  [7] "sodium"   "fiber"    "carbo"    "sugars"   "potass"   "vitamins"
## [13] "shelf"    "weight"   "cups"     "rating"   "ID"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Cereals)
## 'data.frame':    77 obs. of  17 variables:
##  $ name    : Factor w/ 77 levels "100% Bran","100% Natural Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ mfr     : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
##  $ type    : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
##  $ calories: int  70 120 70 50 110 110 110 130 90 90 ...
##  $ protein : int  4 3 4 4 2 2 2 3 2 3 ...
##  $ fat     : int  1 5 1 0 2 2 0 2 1 0 ...
##  $ sodium  : int  130 15 260 140 200 180 125 210 200 210 ...
##  $ fiber   : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ carbo   : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ sugars  : int  6 8 5 0 8 10 14 8 6 5 ...
##  $ potass  : int  280 135 320 330 -1 70 30 100 125 190 ...
##  $ vitamins: int  25 0 25 25 25 25 25 25 25 25 ...
##  $ shelf   : int  3 3 3 3 3 1 2 3 1 3 ...
##  $ weight  : num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ cups    : num  0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
##  $ rating  : num  68.4 34 59.4 93.7 34.4 ...
##  $ ID      : int  1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Cereals, 5)
##                        name mfr type calories protein fat sodium fiber carbo
## 1                 100% Bran   N    C       70       4   1    130    10     5
## 2         100% Natural Bran   Q    C      120       3   5     15     2     8
## 3                  All-Bran   K    C       70       4   1    260     9     7
## 4 All-Bran with Extra Fiber   K    C       50       4   0    140    14     8
## 5            Almond Delight   R    C      110       2   2    200     1    14
##   sugars potass vitamins shelf weight cups rating ID
## 1      6    280       25     3      1 0.33  68.40  1
## 2      8    135        0     3      1 1.00  33.98  2
## 3      5    320       25     3      1 0.33  59.43  3
## 4      0    330       25     3      1 0.50  93.70  4
## 5      8     -1       25     3      1 0.75  34.38  5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(Cereals[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##    Variable  N    Mean      SD    Min    P25  P50    P75    Max
## 1       mfr 77                                                 
## 2     ... A  1  1.299%                                         
## 3     ... G 22 28.571%                                         
## 4     ... K 23  29.87%                                         
## 5     ... N  6  7.792%                                         
## 6     ... P  9 11.688%                                         
## 7     ... Q  8  10.39%                                         
## 8     ... R  8  10.39%                                         
## 9      type 77                                                 
## 10    ... C 74 96.104%                                         
## 11    ... H  3  3.896%                                         
## 12 calories 77  106.88  19.484     50    100  110    110    160
## 13  protein 77  2.5455  1.0948      1      2    3      3      6
## 14      fat 77   1.013  1.0065      0      0    1      2      5
## 15   sodium 77  159.68  83.832      0    130  180    210    320
## 16    fiber 77  2.1519  2.3834      0      1    2      3     14
## 17    carbo 77  14.597   4.279     -1     12   14     17     23
## 18   sugars 76  7.0263  4.3787      0      3    7     11     15
## 19   potass 77  96.078  71.287     -1     40   90    120    330
## 20 vitamins 77  28.247  22.343      0     25   25     25    100
## 21    shelf 77  2.2078 0.83252      1      1    2      3      3
## 22   weight 77  1.0296 0.15048    0.5      1    1      1    1.5
## 23     cups 77 0.82104 0.23272   0.25   0.67 0.75      1    1.5
## 24   rating 77  42.666  14.047 18.043 33.174 40.4 50.828 93.705
## 25       ID 77      39  22.372      1     20   39     58     77
##                   NA Mode
## 1                        
## 2                        
## 3                        
## 4                        
## 5                        
## 6                        
## 7                        
## 8                        
## 9                        
## 10                       
## 11                       
## 12                 0     
## 13                 0     
## 14                 0     
## 15                 0     
## 16                 0     
## 17                 0     
## 18 0.012987012987013     
## 19                 0     
## 20                 0     
## 21                 0     
## 22                 0     
## 23                 0     
## 24                 0     
## 25                 0
# Show number of Blanks per Column (an NA in this result means the column itself contains NA values)
cat("\n\nNumber of Blanks per Column:\n")
## 
## 
## Number of Blanks per Column:
sapply(Cereals, function(x) sum(x == "" | x == "NA"))
##     name      mfr     type calories  protein      fat   sodium    fiber 
##        0        0        0        0        0        0        0        0 
##    carbo   sugars   potass vitamins    shelf   weight     cups   rating 
##        0       NA        0        0        0        0        0        0 
##       ID 
##        0

1.4 Clean Data for Cereals

Check for missing values and handle them appropriately. Missing values can impact model performance, so we need to decide whether to impute or remove them. Ensure that categorical variables are converted to factors for proper modeling.
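
Before deciding on a strategy, it helps to count the missing values per column; a minimal sketch in base R:

# Count missing values per column and the number of incomplete rows
colSums(is.na(Cereals))
sum(!complete.cases(Cereals))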

# Define the function to replace NAs with the mean
NA2mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))

# Apply the function across all numeric columns
Cereals <- Cereals %>%
  mutate(across(where(is.numeric), NA2mean))

cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(Cereals[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##    Variable  N    Mean      SD    Min    P25  P50    P75    Max NA Mode
## 1       mfr 77                                                         
## 2     ... A  1  1.299%                                                 
## 3     ... G 22 28.571%                                                 
## 4     ... K 23  29.87%                                                 
## 5     ... N  6  7.792%                                                 
## 6     ... P  9 11.688%                                                 
## 7     ... Q  8  10.39%                                                 
## 8     ... R  8  10.39%                                                 
## 9      type 77                                                         
## 10    ... C 74 96.104%                                                 
## 11    ... H  3  3.896%                                                 
## 12 calories 77  106.88  19.484     50    100  110    110    160  0     
## 13  protein 77  2.5455  1.0948      1      2    3      3      6  0     
## 14      fat 77   1.013  1.0065      0      0    1      2      5  0     
## 15   sodium 77  159.68  83.832      0    130  180    210    320  0     
## 16    fiber 77  2.1519  2.3834      0      1    2      3     14  0     
## 17    carbo 77  14.597   4.279     -1     12   14     17     23  0     
## 18   sugars 77  7.0263  4.3498      0      3    7     11     15  0     
## 19   potass 77  96.078  71.287     -1     40   90    120    330  0     
## 20 vitamins 77  28.247  22.343      0     25   25     25    100  0     
## 21    shelf 77  2.2078 0.83252      1      1    2      3      3  0     
## 22   weight 77  1.0296 0.15048    0.5      1    1      1    1.5  0     
## 23     cups 77 0.82104 0.23272   0.25   0.67 0.75      1    1.5  0     
## 24   rating 77  42.666  14.047 18.043 33.174 40.4 50.828 93.705  0     
## 25       ID 77      39  22.372      1     20   39     58     77  0
# Write Data to WD
write.csv(Cereals, file = "CerealsClean.csv")
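
Mean imputation is only one option. Below is a brief sketch of two alternatives mentioned above, assuming they are applied to the data before imputation (the object names are illustrative):

# Alternative 1: drop rows that contain any missing value
CerealsComplete <- na.omit(Cereals)

# Alternative 2: impute the column median instead of the mean
NA2median  <- function(x) replace(x, is.na(x), median(x, na.rm = TRUE))
CerealsMed <- Cereals %>% mutate(across(where(is.numeric), NA2median))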

1.5 Make Indicator Variables for Mfr and Type

# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE, INSTALL_opts = '--no-lock') }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE, INSTALL_opts = '--no-lock') }
library(data.table)

# One-hot encode the categorical variables mfr and type (dropping the name column)
Cereals <- one_hot(as.data.table(Cereals[, -1]))

cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Cereals, 5)
##    mfr_A mfr_G mfr_K mfr_N mfr_P mfr_Q mfr_R type_C type_H calories protein
##    <int> <int> <int> <int> <int> <int> <int>  <int>  <int>    <num>   <num>
## 1:     0     0     0     1     0     0     0      1      0       70       4
## 2:     0     0     0     0     0     1     0      1      0      120       3
## 3:     0     0     1     0     0     0     0      1      0       70       4
## 4:     0     0     1     0     0     0     0      1      0       50       4
## 5:     0     0     0     0     0     0     1      1      0      110       2
##      fat sodium fiber carbo sugars potass vitamins shelf weight  cups rating
##    <num>  <num> <num> <num>  <num>  <num>    <num> <num>  <num> <num>  <num>
## 1:     1    130    10     5      6    280       25     3      1  0.33  68.40
## 2:     5     15     2     8      8    135        0     3      1  1.00  33.98
## 3:     1    260     9     7      5    320       25     3      1  0.33  59.43
## 4:     0    140    14     8      0    330       25     3      1  0.50  93.70
## 5:     2    200     1    14      8     -1       25     3      1  0.75  34.38
##       ID
##    <num>
## 1:     1
## 2:     2
## 3:     3
## 4:     4
## 5:     5
cat("\n\nList of Columns (Variables):\n")
## 
## 
## List of Columns (Variables):
names(Cereals)
##  [1] "mfr_A"    "mfr_G"    "mfr_K"    "mfr_N"    "mfr_P"    "mfr_Q"   
##  [7] "mfr_R"    "type_C"   "type_H"   "calories" "protein"  "fat"     
## [13] "sodium"   "fiber"    "carbo"    "sugars"   "potass"   "vitamins"
## [19] "shelf"    "weight"   "cups"     "rating"   "ID"
# Write Data to WD
write.csv(Cereals, file = "CerealsClean.csv")
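
For reference, base R can produce the same kind of indicator variables without extra packages. A minimal sketch using model.matrix(), where CerealsRaw is an assumed name for the data before one-hot encoding (with mfr and type still factors):

# Minimal sketch: dummy coding with base R; model.matrix() drops one reference
# level per factor, avoiding the perfect collinearity of keeping all indicators
Dummies <- model.matrix(~ mfr + type, data = CerealsRaw)[, -1]
head(Dummies)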

1.6 Prepare Data

# Loading other packages if not available
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

# Make Column Names short
Cereals        <- data.frame(Cereals)
names(Cereals) <- str_to_title(names(Cereals))
names(Cereals) <- substr(names(Cereals), 1, 9)

# Split data into train and test
set.seed(123)
split <- initial_split(Cereals, prop = 0.80)
CerealsTrain <- training(split)
CerealsTest  <- testing(split)

# Select numeric columns for correlation matrix
CerealNum  <- data.frame(select_if(CerealsTrain[, -23], is.numeric))

# Show Split Data
cat("\nShow Structure of the Training Data Frame:\n")
## 
## Show Structure of the Training Data Frame:
cat("Training Data Frame includes ", nrow(CerealsTrain), " rows and ", ncol(CerealsTrain), " columns.")
## Training Data Frame includes  61  rows and  23  columns.
cat("\n\nShow Structure of the Testing Data Frame:\n")
## 
## 
## Show Structure of the Testing Data Frame:
cat("Testing Data Frame includes ", nrow(CerealsTest), " rows and ", ncol(CerealsTest), " columns.")
## Testing Data Frame includes  16  rows and  23  columns.
cat("\n\nShow Columns of the Training Data Frame:\n")
## 
## 
## Show Columns of the Training Data Frame:
names(Cereals)
##  [1] "Mfr_a"    "Mfr_g"    "Mfr_k"    "Mfr_n"    "Mfr_p"    "Mfr_q"   
##  [7] "Mfr_r"    "Type_c"   "Type_h"   "Calories" "Protein"  "Fat"     
## [13] "Sodium"   "Fiber"    "Carbo"    "Sugars"   "Potass"   "Vitamins"
## [19] "Shelf"    "Weight"   "Cups"     "Rating"   "Id"
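
caret (loaded in 1.1) offers an equivalent way to create the partition; a minimal sketch (object names are illustrative, and the selected rows will differ from the rsample split):

# Alternative 80/20 split with caret
set.seed(123)
TrainIndex    <- createDataPartition(Cereals$Rating, p = 0.80, list = FALSE)
CerealsTrain2 <- Cereals[TrainIndex, ]
CerealsTest2  <- Cereals[-TrainIndex, ]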

1.7 Compute Correlation Matrix

# Plot Correlation Matrix of Numeric Values
corrMat <- cor(CerealNum)

# Display results 
round(corrMat, 2)
##          Mfr_a Mfr_g Mfr_k Mfr_n Mfr_p Mfr_q Mfr_r Type_c Type_h Calories
## Mfr_a     1.00 -0.09 -0.08 -0.04 -0.05 -0.04 -0.04  -0.70   0.70    -0.05
## Mfr_g    -0.09  1.00 -0.40 -0.20 -0.26 -0.22 -0.22   0.12  -0.12     0.23
## Mfr_k    -0.08 -0.40  1.00 -0.18 -0.23 -0.20 -0.20   0.11  -0.11     0.11
## Mfr_n    -0.04 -0.20 -0.18  1.00 -0.12 -0.10 -0.10  -0.28   0.28    -0.37
## Mfr_p    -0.05 -0.26 -0.23 -0.12  1.00 -0.13 -0.13   0.07  -0.07     0.07
## Mfr_q    -0.04 -0.22 -0.20 -0.10 -0.13  1.00 -0.11   0.06  -0.06    -0.33
## Mfr_r    -0.04 -0.22 -0.20 -0.10 -0.13 -0.11  1.00   0.06  -0.06     0.09
## Type_c   -0.70  0.12  0.11 -0.28  0.07  0.06  0.06   1.00  -1.00     0.08
## Type_h    0.70 -0.12 -0.11  0.28 -0.07 -0.06 -0.06  -1.00   1.00    -0.08
## Calories -0.05  0.23  0.11 -0.37  0.07 -0.33  0.09   0.08  -0.08     1.00
## Protein   0.19 -0.09  0.05  0.11 -0.01 -0.07 -0.02  -0.18   0.18     0.07
## Fat       0.01  0.31 -0.26 -0.25 -0.03  0.08  0.08   0.09  -0.09     0.51
## Sodium   -0.26  0.31  0.10 -0.44 -0.09 -0.18  0.17   0.28  -0.28     0.42
## Fiber    -0.12 -0.23 -0.03  0.38  0.20 -0.14  0.01   0.12  -0.12    -0.10
## Carbo     0.04  0.00  0.09  0.03 -0.19 -0.23  0.27  -0.18   0.18     0.16
## Sugars   -0.12  0.16  0.09 -0.33  0.18 -0.10 -0.12   0.23  -0.23     0.52
## Potass    0.02 -0.01 -0.15  0.18  0.21 -0.14 -0.03   0.11  -0.11     0.11
## Vitamins -0.02  0.17  0.19 -0.26 -0.05 -0.18 -0.05   0.14  -0.14     0.31
## Shelf    -0.02 -0.01  0.01 -0.12  0.12  0.16 -0.18   0.03  -0.03     0.11
## Weight   -0.02  0.17  0.09 -0.11  0.14 -0.42 -0.04   0.02  -0.02     0.76
## Cups      0.09  0.15  0.05 -0.06 -0.25 -0.03  0.04  -0.12   0.12    -0.06
## Rating    0.14 -0.40 -0.02  0.59 -0.02  0.05  0.03  -0.26   0.26    -0.64
##          Protein   Fat Sodium Fiber Carbo Sugars Potass Vitamins Shelf Weight
## Mfr_a       0.19  0.01  -0.26 -0.12  0.04  -0.12   0.02    -0.02 -0.02  -0.02
## Mfr_g      -0.09  0.31   0.31 -0.23  0.00   0.16  -0.01     0.17 -0.01   0.17
## Mfr_k       0.05 -0.26   0.10 -0.03  0.09   0.09  -0.15     0.19  0.01   0.09
## Mfr_n       0.11 -0.25  -0.44  0.38  0.03  -0.33   0.18    -0.26 -0.12  -0.11
## Mfr_p      -0.01 -0.03  -0.09  0.20 -0.19   0.18   0.21    -0.05  0.12   0.14
## Mfr_q      -0.07  0.08  -0.18 -0.14 -0.23  -0.10  -0.14    -0.18  0.16  -0.42
## Mfr_r      -0.02  0.08   0.17  0.01  0.27  -0.12  -0.03    -0.05 -0.18  -0.04
## Type_c     -0.18  0.09   0.28  0.12 -0.18   0.23   0.11     0.14  0.03   0.02
## Type_h      0.18 -0.09  -0.28 -0.12  0.18  -0.23  -0.11    -0.14 -0.03  -0.02
## Calories    0.07  0.51   0.42 -0.10  0.16   0.52   0.11     0.31  0.11   0.76
## Protein     1.00  0.15   0.01  0.48  0.07  -0.32   0.52     0.04  0.13   0.24
## Fat         0.15  1.00   0.12  0.09 -0.26   0.25   0.23     0.05  0.30   0.26
## Sodium      0.01  0.12   1.00 -0.13  0.38   0.03  -0.05     0.34 -0.14   0.37
## Fiber       0.48  0.09  -0.13  1.00 -0.29  -0.05   0.89    -0.04  0.30   0.34
## Carbo       0.07 -0.26   0.38 -0.29  1.00  -0.60  -0.27     0.20 -0.18   0.11
## Sugars     -0.32  0.25   0.03 -0.05 -0.60   1.00   0.08     0.13  0.09   0.43
## Potass      0.52  0.23  -0.05  0.89 -0.27   0.08   1.00     0.07  0.36   0.50
## Vitamins    0.04  0.05   0.34 -0.04  0.20   0.13   0.07     1.00  0.28   0.35
## Shelf       0.13  0.30  -0.14  0.30 -0.18   0.09   0.36     0.28  1.00   0.19
## Weight      0.24  0.26   0.37  0.34  0.11   0.43   0.50     0.35  0.19   1.00
## Cups       -0.21 -0.33   0.11 -0.50  0.36  -0.10  -0.45     0.12 -0.39  -0.21
## Rating      0.49 -0.40  -0.45  0.48  0.22  -0.76   0.28    -0.28  0.01  -0.32
##           Cups Rating
## Mfr_a     0.09   0.14
## Mfr_g     0.15  -0.40
## Mfr_k     0.05  -0.02
## Mfr_n    -0.06   0.59
## Mfr_p    -0.25  -0.02
## Mfr_q    -0.03   0.05
## Mfr_r     0.04   0.03
## Type_c   -0.12  -0.26
## Type_h    0.12   0.26
## Calories -0.06  -0.64
## Protein  -0.21   0.49
## Fat      -0.33  -0.40
## Sodium    0.11  -0.45
## Fiber    -0.50   0.48
## Carbo     0.36   0.22
## Sugars   -0.10  -0.76
## Potass   -0.45   0.28
## Vitamins  0.12  -0.28
## Shelf    -0.39   0.01
## Weight   -0.21  -0.32
## Cups      1.00  -0.09
## Rating   -0.09   1.00
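
Because Rating is the response, its correlations with the individual predictors are of particular interest; a short sketch that extracts them from the matrix above:

# Correlations of each variable with Rating, ordered by absolute strength
RatingCor <- corrMat[, "Rating"]
RatingCor[order(abs(RatingCor), decreasing = TRUE)]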

1.8 Show Correlation Matrix

# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)

cat("\n\nShow Correlation Plot of Cereal Data:\n")
## 
## 
## Show Correlation Plot of Cereal Data:
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

1.9 Perform Regression 1 to Train Model

Create a multiple regression model using the lm() function. The response variable (Rating) is continuous, and predictors should be selected based on domain knowledge. Check the regression assumptions, including linearity and multicollinearity; a quick linearity check is sketched below, and multicollinearity is examined in the sections that follow.
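
A minimal sketch of a visual linearity check on the training split created above (the chosen predictors are illustrative):

# Scatterplot matrix of Rating against a few illustrative predictors
pairs(CerealsTrain[, c("Rating", "Calories", "Protein", "Fiber", "Sugars")],
      main = "Rating vs Selected Predictors")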

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Run Linear Regression with all Xs
LinReg01 <- lm(Rating ~ Mfr_g + Mfr_k + Mfr_n + Mfr_p + Mfr_q + Mfr_r + Calories + Protein + Fat + Sodium + Fiber + Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)

summary(LinReg01)
## 
## Call:
## lm(formula = Rating ~ Mfr_g + Mfr_k + Mfr_n + Mfr_p + Mfr_q + 
##     Mfr_r + Calories + Protein + Fat + Sodium + Fiber + Carbo + 
##     Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.0000005432 -0.0000002190 -0.0000000067  0.0000001784  0.0000005201 
## 
## Coefficients:
##                    Estimate      Std. Error      t value            Pr(>|t|)
## (Intercept) 54.927184076586  0.000000574968  95530922.79 <0.0000000000000002
## Mfr_g        0.000000070142  0.000000393572         0.18                0.86
## Mfr_k       -0.000000019625  0.000000436983        -0.04                0.96
## Mfr_n        0.000000000697  0.000000462933         0.00                1.00
## Mfr_p        0.000000258209  0.000000424684         0.61                0.55
## Mfr_q        0.000000038642  0.000000405081         0.10                0.92
## Mfr_r       -0.000000106761  0.000000468450        -0.23                0.82
## Calories    -0.222724168805  0.000000008860 -25138927.43 <0.0000000000000002
## Protein      3.273173867642  0.000000060806  53829736.69 <0.0000000000000002
## Fat         -1.691407935680  0.000000097071 -17424498.05 <0.0000000000000002
## Sodium      -0.054492702474  0.000000000760 -71658380.96 <0.0000000000000002
## Fiber        3.443479867529  0.000000095657  35998084.37 <0.0000000000000002
## Carbo        1.092450961999  0.000000047682  22911291.53 <0.0000000000000002
## Sugars      -0.724895126352  0.000000043386 -16708145.91 <0.0000000000000002
## Potass      -0.033993352893  0.000000002332 -14578389.20 <0.0000000000000002
## Vitamins    -0.051211966321  0.000000002392 -21407941.11 <0.0000000000000002
## Shelf       -0.000000102398  0.000000065294        -1.57                0.12
## Weight      -0.000000102594  0.000000846602        -0.12                0.90
## Cups         0.000000246336  0.000000238248         1.03                0.31
##                
## (Intercept) ***
## Mfr_g          
## Mfr_k          
## Mfr_n          
## Mfr_p          
## Mfr_q          
## Mfr_r          
## Calories    ***
## Protein     ***
## Fat         ***
## Sodium      ***
## Fiber       ***
## Carbo       ***
## Sugars      ***
## Potass      ***
## Vitamins    ***
## Shelf          
## Weight         
## Cups           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000000304 on 42 degrees of freedom
## Multiple R-squared:     1,   Adjusted R-squared:     1 
## F-statistic: 6.22e+15 on 18 and 42 DF,  p-value: <0.0000000000000002
anova(LinReg01)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq              F value              Pr(>F)    
## Mfr_g      1   1644    1644 17826911792018684.00 <0.0000000000000002 ***
## Mfr_k      1    408     408  4424468649701570.00 <0.0000000000000002 ***
## Mfr_n      1   2443    2443 26496108344241260.00 <0.0000000000000002 ***
## Mfr_p      1     51      51   558420728560620.50 <0.0000000000000002 ***
## Mfr_q      1      3       3    36273542372380.66 <0.0000000000000002 ***
## Mfr_r      1    135     135  1461432841867144.75 <0.0000000000000002 ***
## Calories   1   1806    1806 19590988157889628.00 <0.0000000000000002 ***
## Protein    1   2188    2188 23728118829838116.00 <0.0000000000000002 ***
## Fat        1     58      58   632887772742377.62 <0.0000000000000002 ***
## Sodium     1    100     100  1085712742772386.25 <0.0000000000000002 ***
## Fiber      1     43      43   468880954465192.94 <0.0000000000000002 ***
## Carbo      1   1291    1291 14002347678624214.00 <0.0000000000000002 ***
## Sugars     1     71      71   771674488825328.00 <0.0000000000000002 ***
## Potass     1     26      26   276754550243907.62 <0.0000000000000002 ***
## Vitamins   1     49      49   534286553581679.00 <0.0000000000000002 ***
## Shelf      1      0       0                 3.64               0.063 .  
## Weight     1      0       0                 0.16               0.691    
## Cups       1      0       0                 1.07               0.307    
## Residuals 42      0       0                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.10 Explore Multicollinearity

Perform diagnostic checks to evaluate the model assumptions: check for normality of residuals, homoscedasticity, and multicollinearity, using both visualisations and statistical tests.

# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Calculating VIF
vif_values <- vif(LinReg01)
cat("\n\nShow List of VIF for all Factors:\n")
## 
## 
## Show List of VIF for all Factors:
vif_values
##    Mfr_g    Mfr_k    Mfr_n    Mfr_p    Mfr_q    Mfr_r Calories  Protein 
##   21.976   24.443   10.668   13.595    9.627   12.875   14.734    2.997 
##      Fat   Sodium    Fiber    Carbo   Sugars   Potass Vitamins    Shelf 
##    4.992    2.588   21.808   21.219   24.680   13.623    1.559    1.858 
##   Weight     Cups 
##   10.243    2.039
names(CerealNum)
##  [1] "Mfr_a"    "Mfr_g"    "Mfr_k"    "Mfr_n"    "Mfr_p"    "Mfr_q"   
##  [7] "Mfr_r"    "Type_c"   "Type_h"   "Calories" "Protein"  "Fat"     
## [13] "Sodium"   "Fiber"    "Carbo"    "Sugars"   "Potass"   "Vitamins"
## [19] "Shelf"    "Weight"   "Cups"     "Rating"
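
A common rule of thumb flags VIF values above 5 or 10 as evidence of problematic multicollinearity; a one-line sketch to list the offenders:

# Predictors whose VIF exceeds the common threshold of 10
vif_values[vif_values > 10]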

1.11 Show Correlation Matrix

# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)

# Drop the Mfr_ and Type_ indicator columns (1:9) from CerealNum
CerealNum <- CerealNum[, -c(1:9)]

# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

1.12 Perform Regression 2 to Train Model

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

LinReg02 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Fiber + Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)

summary(LinReg02)
## 
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Fiber + 
##     Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups, 
##     data = CerealsTrain)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.0000005394 -0.0000001912  0.0000000198  0.0000001762  0.0000005301 
## 
## Coefficients:
##                    Estimate      Std. Error      t value            Pr(>|t|)
## (Intercept) 54.927184197908  0.000000423301 129759058.33 <0.0000000000000002
## Calories    -0.222724166150  0.000000008204 -27147569.85 <0.0000000000000002
## Protein      3.273173839209  0.000000057083  57340144.61 <0.0000000000000002
## Fat         -1.691407979275  0.000000083479 -20261427.86 <0.0000000000000002
## Sodium      -0.054492702426  0.000000000584 -93246016.59 <0.0000000000000002
## Fiber        3.443479823430  0.000000057110  60296059.78 <0.0000000000000002
## Carbo        1.092450932970  0.000000040634  26885005.95 <0.0000000000000002
## Sugars      -0.724895147880  0.000000037884 -19134619.22 <0.0000000000000002
## Potass      -0.033993351852  0.000000001729 -19655615.39 <0.0000000000000002
## Vitamins    -0.051211966647  0.000000002320 -22074483.90 <0.0000000000000002
## Shelf       -0.000000088573  0.000000059633        -1.49                0.14
## Weight       0.000000206905  0.000000686180         0.30                0.76
## Cups         0.000000224078  0.000000228385         0.98                0.33
##                
## (Intercept) ***
## Calories    ***
## Protein     ***
## Fat         ***
## Sodium      ***
## Fiber       ***
## Carbo       ***
## Sugars      ***
## Potass      ***
## Vitamins    ***
## Shelf          
## Weight         
## Cups           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0000003 on 48 degrees of freedom
## Multiple R-squared:     1,   Adjusted R-squared:     1 
## F-statistic: 9.56e+15 on 12 and 48 DF,  p-value: <0.0000000000000002
anova(LinReg02)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq              F value              Pr(>F)    
## Calories   1   4235    4235 47084416123530368.00 <0.0000000000000002 ***
## Protein    1   2998    2998 33326560615337632.00 <0.0000000000000002 ***
## Fat        1    271     271  3008445072310896.50 <0.0000000000000002 ***
## Sodium     1    455     455  5053205036681088.00 <0.0000000000000002 ***
## Fiber      1    325     325  3611960742822140.50 <0.0000000000000002 ***
## Carbo      1   1845    1845 20508484998727288.00 <0.0000000000000002 ***
## Sugars     1     89      89   984525405604054.75 <0.0000000000000002 ***
## Potass     1     50      50   550839227939786.75 <0.0000000000000002 ***
## Vitamins   1     53      53   584443708100659.00 <0.0000000000000002 ***
## Shelf      1      0       0                 3.37               0.073 .  
## Weight     1      0       0                 0.01               0.904    
## Cups       1      0       0                 0.96               0.331    
## Residuals 48      0       0                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.13 Explore Multicollinearity

# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Calculating VIF
vif_values <- vif(LinReg02)
cat("\n\nShow List of VIF for Factors:\n")
## 
## 
## Show List of VIF for Factors:
vif_values
## Calories  Protein      Fat   Sodium    Fiber    Carbo   Sugars   Potass 
##   12.952    2.708    3.785    1.567    7.969   15.798   19.291    7.683 
## Vitamins    Shelf   Weight     Cups 
##    1.503    1.589    6.898    1.921

1.14 Show Correlation Matrix

# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)

# Drop the Carbo column (column 6) from CerealNum
CerealNum <- CerealNum[, -c(6)]

# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

1.15 Perform Regression 3 to Train Model

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

LinReg03 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Fiber + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)

summary(LinReg03)
## 
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Fiber + 
##     Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0964 -0.6892 -0.0974  0.5579  2.9248 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 55.49816    1.62373   34.18 < 0.0000000000000002 ***
## Calories    -0.06443    0.02194   -2.94               0.0050 ** 
## Protein      2.42271    0.18250   13.28 < 0.0000000000000002 ***
## Fat         -3.18399    0.23944  -13.30 < 0.0000000000000002 ***
## Sodium      -0.05253    0.00223  -23.59 < 0.0000000000000002 ***
## Fiber        3.04367    0.21177   14.37 < 0.0000000000000002 ***
## Sugars      -1.68882    0.04699  -35.94 < 0.0000000000000002 ***
## Potass      -0.03305    0.00664   -4.98            0.0000084 ***
## Vitamins    -0.04736    0.00889   -5.33            0.0000025 ***
## Shelf        0.04574    0.22894    0.20               0.8425    
## Weight       7.42864    2.41231    3.08               0.0034 ** 
## Cups         1.91401    0.83345    2.30               0.0260 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.15 on 49 degrees of freedom
## Multiple R-squared:  0.994,  Adjusted R-squared:  0.992 
## F-statistic:  703 on 11 and 49 DF,  p-value: <0.0000000000000002
anova(LinReg03)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq F value               Pr(>F)    
## Calories   1   4235    4235 3191.93 < 0.0000000000000002 ***
## Protein    1   2998    2998 2259.26 < 0.0000000000000002 ***
## Fat        1    271     271  203.95 < 0.0000000000000002 ***
## Sodium     1    455     455  342.56 < 0.0000000000000002 ***
## Fiber      1    325     325  244.86 < 0.0000000000000002 ***
## Sugars     1   1884    1884 1419.89 < 0.0000000000000002 ***
## Potass     1     33      33   24.74            0.0000085 ***
## Vitamins   1     35      35   26.33            0.0000049 ***
## Shelf      1      0       0    0.21               0.6479    
## Weight     1     11      11    8.57               0.0052 ** 
## Cups       1      7       7    5.27               0.0260 *  
## Residuals 49     65       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.16 Explore Multicollinearity

# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Calculating VIF
vif_values <- vif(LinReg03)
cat("\n\nShow List of VIF for Factors:\n")
## 
## 
## Show List of VIF for Factors:
vif_values
## Calories  Protein      Fat   Sodium    Fiber   Sugars   Potass Vitamins 
##    6.282    1.877    2.111    1.542    7.428    2.012    7.679    1.498 
##    Shelf   Weight     Cups 
##    1.588    5.780    1.734

1.17 Show Correlation Matrix

# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)

# Drop the Fiber column (column 5) from CerealNum
CerealNum <- CerealNum[, -c(5)]

# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
 upper = "pie",
 lower = "number",
 addgrid.col = "black",
 tl.col = "black")

1.18 Perform Regression 4 to Train Model

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

LinReg04 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)

summary(LinReg04)
## 
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars + 
##     Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.477 -1.436 -0.029  1.526  5.305 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 62.27041    3.51296   17.73 < 0.0000000000000002 ***
## Calories    -0.15150    0.04768   -3.18               0.0025 ** 
## Protein      2.50371    0.41240    6.07     0.00000016968177 ***
## Fat         -3.39904    0.54027   -6.29     0.00000007698998 ***
## Sodium      -0.05350    0.00503  -10.63     0.00000000000002 ***
## Sugars      -1.74649    0.10585  -16.50 < 0.0000000000000002 ***
## Potass       0.04137    0.00940    4.40     0.00005670600108 ***
## Vitamins    -0.05618    0.02006   -2.80               0.0072 ** 
## Shelf       -0.17109    0.51646   -0.33               0.7418    
## Weight      13.52251    5.36884    2.52               0.0150 *  
## Cups        -1.87535    1.78750   -1.05               0.2992    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.6 on 50 degrees of freedom
## Multiple R-squared:  0.967,  Adjusted R-squared:  0.961 
## F-statistic:  147 on 10 and 50 DF,  p-value: <0.0000000000000002
anova(LinReg04)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq F value               Pr(>F)    
## Calories   1   4235    4235  624.49 < 0.0000000000000002 ***
## Protein    1   2998    2998  442.02 < 0.0000000000000002 ***
## Fat        1    271     271   39.90       0.000000070271 ***
## Sodium     1    455     455   67.02       0.000000000086 ***
## Sugars     1   1415    1415  208.73 < 0.0000000000000002 ***
## Potass     1    483     483   71.24       0.000000000035 ***
## Vitamins   1     66      66    9.77               0.0030 ** 
## Shelf      1      0       0    0.00               0.9972    
## Weight     1     49      49    7.20               0.0099 ** 
## Cups       1      7       7    1.10               0.2992    
## Residuals 50    339       7                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.19 Explore Multicollinearity

# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)

# Calculating VIF
vif_values <- vif(LinReg04)
cat("\n\nShow List of VIF for Factors:\n")
## 
## 
## Show List of VIF for Factors:
vif_values
## Calories  Protein      Fat   Sodium   Sugars   Potass Vitamins    Shelf 
##    5.803    1.875    2.103    1.541    1.998    3.011    1.490    1.581 
##   Weight     Cups 
##    5.601    1.561

1.20 Perform Regression 5 to Train Model

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Run Regression
LinReg05 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins + Weight, data = CerealsTrain)

summary(LinReg05)
## 
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars + 
##     Potass + Vitamins + Weight, data = CerealsTrain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.789 -1.587 -0.068  1.385  5.272 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 60.17314    2.71925   22.13 < 0.0000000000000002 ***
## Calories    -0.15872    0.04675   -3.40               0.0013 ** 
## Protein      2.51268    0.40689    6.18   0.0000001016848212 ***
## Fat         -3.27124    0.50378   -6.49   0.0000000317633263 ***
## Sodium      -0.05339    0.00483  -11.06   0.0000000000000029 ***
## Sugars      -1.74005    0.10462  -16.63 < 0.0000000000000002 ***
## Potass       0.04264    0.00901    4.73   0.0000174863141909 ***
## Vitamins    -0.06168    0.01799   -3.43               0.0012 ** 
## Weight      14.27110    5.27415    2.71               0.0092 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.58 on 52 degrees of freedom
## Multiple R-squared:  0.966,  Adjusted R-squared:  0.961 
## F-statistic:  187 on 8 and 52 DF,  p-value: <0.0000000000000002
anova(LinReg05)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq F value               Pr(>F)    
## Calories   1   4235    4235  635.47 < 0.0000000000000002 ***
## Protein    1   2998    2998  449.79 < 0.0000000000000002 ***
## Fat        1    271     271   40.60        0.00000004954 ***
## Sodium     1    455     455   68.20        0.00000000005 ***
## Sugars     1   1415    1415  212.40 < 0.0000000000000002 ***
## Potass     1    483     483   72.49        0.00000000002 ***
## Vitamins   1     66      66    9.94               0.0027 ** 
## Weight     1     49      49    7.32               0.0092 ** 
## Residuals 52    347       7                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculating VIF
vif_values <- vif(LinReg05)
cat("\n\nShow List of VIF for Factors:\n")
## 
## 
## Show List of VIF for Factors:
vif_values
## Calories  Protein      Fat   Sodium   Sugars   Potass Vitamins   Weight 
##    5.675    1.857    1.860    1.443    1.986    2.816    1.220    5.500

1.21 Perform Regression 6 to Train Model

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

LinReg06 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins, data = CerealsTrain)

summary(LinReg06)
## 
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars + 
##     Potass + Vitamins, data = CerealsTrain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.040 -2.050 -0.098  1.609  6.485 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 63.13419    2.63362   23.97 < 0.0000000000000002 ***
## Calories    -0.06406    0.03280   -1.95               0.0561 .  
## Protein      2.50596    0.43047    5.82    0.000000347773263 ***
## Fat         -3.90541    0.47179   -8.28    0.000000000040370 ***
## Sodium      -0.05111    0.00503  -10.16    0.000000000000048 ***
## Sugars      -1.71224    0.11015  -15.54 < 0.0000000000000002 ***
## Potass       0.05885    0.00712    8.26    0.000000000043202 ***
## Vitamins    -0.05638    0.01892   -2.98               0.0043 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.73 on 53 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.957 
## F-statistic:  190 on 7 and 53 DF,  p-value: <0.0000000000000002
anova(LinReg06)
## Analysis of Variance Table
## 
## Response: Rating
##           Df Sum Sq Mean Sq F value               Pr(>F)    
## Calories   1   4235    4235  567.75 < 0.0000000000000002 ***
## Protein    1   2998    2998  401.86 < 0.0000000000000002 ***
## Fat        1    271     271   36.28       0.000000166435 ***
## Sodium     1    455     455   60.93       0.000000000229 ***
## Sugars     1   1415    1415  189.76 < 0.0000000000000002 ***
## Potass     1    483     483   64.76       0.000000000094 ***
## Vitamins   1     66      66    8.88               0.0043 ** 
## Residuals 53    395       7                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculating VIF
vif_values <- vif(LinReg06)
vif_values
## Calories  Protein      Fat   Sodium   Sugars   Potass Vitamins 
##    2.496    1.857    1.458    1.400    1.967    1.572    1.205

1.22 Prepare Data for Pie Chart

# Get the ANOVA table
anova_table <- anova(LinReg06)

# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`

# Prepare data for the pie chart
SSdf <- data.frame(
  Component = rownames(anova_table),
  SumSq = SumSq
)

# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]

# Combine the last two rows (the two smallest components) into a single row
summary_row <- SSdf %>%
  slice_tail(n = 2) %>%
  summarise(Component = "Fat+Vitamin", SumSq = sum(SumSq))

# Drop the last two rows from the original data frame
Rows <- nrow(SSdf) - 2
SSdf <- SSdf[1:Rows, ]

# Add the summary row to the modified data frame
SSdf <- bind_rows(SSdf, summary_row)

# Add Proportion Column and compute the cumulative sum (kept as a separate vector)
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)

# Add Percentage Column
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")

# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)

SSdf
##     Component  SumSq Percent                   Labels
## 1    Calories 4235.0 41.05 %  Calories\n4235\n41.05 %
## 2     Protein 2997.6 29.05 %   Protein\n2998\n29.05 %
## 3      Sugars 1415.5 13.72 %    Sugars\n1415\n13.72 %
## 4      Potass  483.1  4.68 %      Potass\n483\n4.68 %
## 5      Sodium  454.5  4.41 %      Sodium\n455\n4.41 %
## 6   Residuals  395.3  3.83 %   Residuals\n395\n3.83 %
## 7 Fat+Vitamin  336.9  3.26 % Fat+Vitamin\n337\n3.26 %

1.23 Show Pie Chart of Sum of Squares for Regression Model

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels), fill = alpha("white", 0),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Cereal Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

1.24 Show Column Chart of Sum of Squares for Regression Model

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the column chart
ggplot(SSdf, aes(x = reorder(Component, -SumSq), y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_col() +
  geom_text(aes(label = Labels),
            colour = "black",
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Cereal Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

1.25 Show Pareto Chart of Sum of Squares for Regression Model

# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)

# Prepare Table
Pareto <- (SSdf$SumSq)
names(Pareto) <- SSdf$Component

# Add labels to the cumulative percentage line
line_x <- seq_along(SSdf$Component) + 0.5  # Adjust x-position if needed
line_y <- CumSum   # cumulative Sum of Squares computed in 1.22 (not stored in SSdf)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
                     ylab = "Sum of Squares", # label y left
 
  # colors of the chart            
  col = Colours,
   
  # ranges of the percentages at the right
  cumperc = seq(0, 100, by = 20),
   
  # label y right
  ylab2 = "Cumulative Percentage",
   
  # title of the chart
  main     = "Sum of Squares from Regression Model Summary",
  subtitle = "Based on Cereal Data",
  
  # font size
  cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0
)
##              
## Pareto chart analysis for Pareto
##               Frequency Cum.Freq. Percentage Cum.Percent.
##   Calories     4235.020  4235.020     41.045       41.045
##   Protein      2997.566  7232.586     29.052       70.098
##   Sugars       1415.494  8648.080     13.719       83.816
##   Potass        483.082  9131.162      4.682       88.498
##   Sodium        454.512  9585.674      4.405       92.904
##   Residuals     395.343  9981.017      3.832       96.735
##   Fat+Vitamin   336.863 10317.880      3.265      100.000
# Overlay the cumulative line at the label positions
lines(line_x, line_y, col = "red")

# Add cumulative-percentage labels to the line
text(line_x, line_y, labels = round(CumSum / sum(SSdf$SumSq) * 100, 1), pos = 3, col = "red")

1.26 Make Predictions on Testing Data

# Make predictions on the testing data
predictions <- predict(LinReg06, newdata = CerealsTest)

1.27 Evaluate Performance Metrics

Use performance metrics such as R-squared, RMSE, and MAE to assess model accuracy. Compare predicted values with actual values to measure predictive power. These metrics help in comparing models and improving predictions.

# Calculate RMSE
predictions <- predict(LinReg06, newdata = CerealsTrain)
actual <- CerealsTrain$Rating
RMSE <- sqrt(mean((predictions - actual)^2))
cat("\n\nRMSE of Cereal Train: ", RMSE)
## 
## 
## RMSE of Cereal Train:  2.546
predictions <- predict(LinReg06, newdata = CerealsTest)
actual <- CerealsTest$Rating
RMSE <- sqrt(mean((predictions - actual)^2))
cat("\n\nRMSE of Cereal Test : ", RMSE)
## 
## 
## RMSE of Cereal Test :  3.937
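
RMSE is only one of the metrics mentioned above. A short sketch of MAE and R-squared on the testing data, including caret's postResample() helper (caret is already loaded):

# MAE on the testing data
MAE <- mean(abs(predictions - actual))
cat("\n\nMAE of Cereal Test : ", MAE)

# R-squared on the testing data
R2 <- 1 - sum((predictions - actual)^2) / sum((actual - mean(actual))^2)
cat("\nR2  of Cereal Test : ", R2)

# caret returns RMSE, R-squared, and MAE in one call
postResample(pred = predictions, obs = actual)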

1.28 Plot All Residual Plots

Plot standardised residuals against the fitted values, a normal Q-Q plot of the residuals, the residuals in observation order, and Cook's distance to assess the model fit visually. Ideally, the residuals scatter randomly around zero, follow the reference line in the Q-Q plot, show no trend over the observation sequence, and no single observation has an unusually large Cook's distance. These plots help to detect systematic errors, non-normality, and influential points.

# Add Model Data to CerealsTrain
CerealsTrain$Resid  <- LinReg06$residuals
CerealsTrain$Fit    <- LinReg06$fitted.values
MeanRes             <- mean(CerealsTrain$Resid)
SDRes               <- sd(CerealsTrain$Resid)
CerealsTrain$StdRes <- (CerealsTrain$Resid - MeanRes) / SDRes

# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))

# Residuals vs Fitted
plot(CerealsTrain$Fit, CerealsTrain$StdRes, main = "Std Residuals vs Fitted", xlab = "Fitted values", ylab = "Std Residuals")
abline(h = 0, col = "red")

# Normal Q-Q
qqnorm(LinReg06$residuals, main = "Normal Q-Q")
qqline(LinReg06$residuals, col = "red")

# Residuals vs Time (ID)
plot(CerealsTrain$Id, CerealsTrain$StdRes,      main = "Residuals over Time (Sequence of data)",      xlab = "Observation Order (ID)",      ylab = "Residuals")
abline(h = 0, col = "red")

# Calculating Cook's Distance
cutoff <- 4/(nrow(CerealsTrain)-length(LinReg06$coefficients)-2)

# Plot Cook's Distance for each observation
plot(LinReg06, which = 4, cook.levels = cutoff)
abline(h = cutoff, lty = 2, col = "red")

# Reset plotting environment
par(mfrow = c(1, 1))
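
The visual checks above can be complemented with formal tests; a minimal sketch using stats and car (already loaded), applied to the final model:

# Formal checks of the residual assumptions for LinReg06
shapiro.test(LinReg06$residuals)   # normality of residuals
ncvTest(LinReg06)                  # non-constant variance (homoscedasticity)
durbinWatsonTest(LinReg06)         # autocorrelation of residuals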

2 Model Evaluation Techniques for Classification and Regression Trees

2.1 Get Data for Predicting Whether Income is < 50k

# Download Data from URL
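
As above, the download code is hidden. A minimal sketch of how the adult census data might be read, with the file name and read options as assumptions:

# Minimal sketch: read the adult census data (file name is an assumption)
Adults <- read.csv("adult.csv", stringsAsFactors = TRUE, strip.white = TRUE)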

2.2 Prepare Data

# Collapse Categories
levels(Adults$marital.status)[2:4] <- "Married"
levels(Adults$workclass)[c(2,3,8)] <- "Gov"
levels(Adults$workclass)[c(5, 6)]  <- "Self"


# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

# Prepare Columns
names(Adults) <- str_to_title(names(Adults))
names(Adults) <- substr(names(Adults), 1, 6)

colnames(Adults)[2]  <- "Class"
colnames(Adults)[5]  <- "EduNum"
colnames(Adults)[6]  <- "Marital"
colnames(Adults)[7]  <- "Occupat"
colnames(Adults)[8]  <- "Relationship"
colnames(Adults)[11] <- "CapGain"
colnames(Adults)[12] <- "CapLoss"
colnames(Adults)[13] <- "WeekHours"

# Change Column
Adults$Income <- ifelse(Adults$Income == ">50K", "Earns >50k", "Earns <=50k")

# Show 4 Rows of Dataset
head(Adults, 4)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native      Income
## 1    Own-child Black   Male       0       0        40 United-States Earns <=50k
## 2      Husband White   Male       0       0        50 United-States Earns <=50k
## 3      Husband White   Male       0       0        40 United-States  Earns >50k
## 4      Husband Black   Male    7688       0        40 United-States  Earns >50k
# Write data to working directory
write.csv(Adults, file = "Adults.csv") 
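
A quick cross-check that the factor levels collapsed as intended (a minimal sketch using the renamed columns):

# Verify the collapsed factor levels
table(Adults$Class)
table(Adults$Marital)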

2.3 Explore Data Frame

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Adults)
## 'data.frame':    48842 obs. of  15 variables:
##  $ Age         : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ Class       : Factor w/ 6 levels "?","Gov","Never-worked",..: 4 4 2 4 1 4 1 5 4 4 ...
##  $ Fnlwgt      : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ Educat      : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ EduNum      : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ Marital     : Factor w/ 5 levels "Divorced","Married",..: 3 2 2 2 3 3 3 2 3 2 ...
##  $ Occupat     : Factor w/ 15 levels "?","Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ Relationship: Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ Race        : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ Gender      : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ CapGain     : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ CapLoss     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ WeekHours   : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Native      : Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 40 ...
##  $ Income      : chr  "Earns <=50k" "Earns <=50k" "Earns >50k" "Earns >50k" ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Adults, 5)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
## 5  18       ? 103497 Some-college     10 Never-married                 ?
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native      Income
## 1    Own-child Black   Male       0       0        40 United-States Earns <=50k
## 2      Husband White   Male       0       0        50 United-States Earns <=50k
## 3      Husband White   Male       0       0        40 United-States  Earns >50k
## 4      Husband Black   Male    7688       0        40 United-States  Earns >50k
## 5    Own-child White Female       0       0        30 United-States Earns <=50k
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(Adults, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##                           Variable     N    Mean     SD   Min    P25    P50
## 1                              Age 48842  38.644 13.711    17     28     37
## 2                            Class 48842                                   
## 3                            ... ?  2799  5.731%                           
## 4                          ... Gov  6549 13.409%                           
## 5                 ... Never-worked    10   0.02%                           
## 6                      ... Private 33906  69.42%                           
## 7                         ... Self  5557 11.378%                           
## 8                  ... Without-pay    21  0.043%                           
## 9                           Fnlwgt 48842  189664 105604 12285 117551 178145
## 10                          Educat 48842                                   
## 11                        ... 10th  1389  2.844%                           
## 12                        ... 11th  1812   3.71%                           
## 13                        ... 12th   657  1.345%                           
## 14                     ... 1st-4th   247  0.506%                           
## 15                     ... 5th-6th   509  1.042%                           
## 16                     ... 7th-8th   955  1.955%                           
## 17                         ... 9th   756  1.548%                           
## 18                  ... Assoc-acdm  1601  3.278%                           
## 19                   ... Assoc-voc  2061   4.22%                           
## 20                   ... Bachelors  8025 16.431%                           
## 21                   ... Doctorate   594  1.216%                           
## 22                     ... HS-grad 15784 32.316%                           
## 23                     ... Masters  2657   5.44%                           
## 24                   ... Preschool    83   0.17%                           
## 25                 ... Prof-school   834  1.708%                           
## 26                ... Some-college 10878 22.272%                           
## 27                          EduNum 48842  10.078  2.571     1      9     10
## 28                         Marital 48842                                   
## 29                    ... Divorced  6633 13.581%                           
## 30                     ... Married 23044 47.181%                           
## 31               ... Never-married 16117 32.998%                           
## 32                   ... Separated  1530  3.133%                           
## 33                     ... Widowed  1518  3.108%                           
## 34                         Occupat 48842                                   
## 35                           ... ?  2809  5.751%                           
## 36                ... Adm-clerical  5611 11.488%                           
## 37                ... Armed-Forces    15  0.031%                           
## 38                ... Craft-repair  6112 12.514%                           
## 39             ... Exec-managerial  6086 12.461%                           
## 40             ... Farming-fishing  1490  3.051%                           
## 41           ... Handlers-cleaners  2072  4.242%                           
## 42           ... Machine-op-inspct  3022  6.187%                           
## 43               ... Other-service  4923 10.079%                           
## 44             ... Priv-house-serv   242  0.495%                           
## 45              ... Prof-specialty  6172 12.637%                           
## 46             ... Protective-serv   983  2.013%                           
## 47                       ... Sales  5504 11.269%                           
## 48                ... Tech-support  1446  2.961%                           
## 49            ... Transport-moving  2355  4.822%                           
## 50                    Relationship 48842                                   
## 51                     ... Husband 19716 40.367%                           
## 52               ... Not-in-family 12583 25.763%                           
## 53              ... Other-relative  1506  3.083%                           
## 54                   ... Own-child  7581 15.521%                           
## 55                   ... Unmarried  5125 10.493%                           
## 56                        ... Wife  2331  4.773%                           
## 57                            Race 48842                                   
## 58          ... Amer-Indian-Eskimo   470  0.962%                           
## 59          ... Asian-Pac-Islander  1519   3.11%                           
## 60                       ... Black  4685  9.592%                           
## 61                       ... Other   406  0.831%                           
## 62                       ... White 41762 85.504%                           
## 63                          Gender 48842                                   
## 64                      ... Female 16192 33.152%                           
## 65                        ... Male 32650 66.848%                           
## 66                         CapGain 48842  1079.1   7452     0      0      0
## 67                         CapLoss 48842  87.502    403     0      0      0
## 68                       WeekHours 48842  40.422 12.391     1     40     40
## 69                          Native 48842                                   
## 70                           ... ?   857  1.755%                           
## 71                    ... Cambodia    28  0.057%                           
## 72                      ... Canada   182  0.373%                           
## 73                       ... China   122   0.25%                           
## 74                    ... Columbia    85  0.174%                           
## 75                        ... Cuba   138  0.283%                           
## 76          ... Dominican-Republic   103  0.211%                           
## 77                     ... Ecuador    45  0.092%                           
## 78                 ... El-Salvador   155  0.317%                           
## 79                     ... England   127   0.26%                           
## 80                      ... France    38  0.078%                           
## 81                     ... Germany   206  0.422%                           
## 82                      ... Greece    49    0.1%                           
## 83                   ... Guatemala    88   0.18%                           
## 84                       ... Haiti    75  0.154%                           
## 85          ... Holand-Netherlands     1  0.002%                           
## 86                    ... Honduras    20  0.041%                           
## 87                        ... Hong    30  0.061%                           
## 88                     ... Hungary    19  0.039%                           
## 89                       ... India   151  0.309%                           
## 90                        ... Iran    59  0.121%                           
## 91                     ... Ireland    37  0.076%                           
## 92                       ... Italy   105  0.215%                           
## 93                     ... Jamaica   106  0.217%                           
## 94                       ... Japan    92  0.188%                           
## 95                        ... Laos    23  0.047%                           
## 96                      ... Mexico   951  1.947%                           
## 97                   ... Nicaragua    49    0.1%                           
## 98  ... Outlying-US(Guam-USVI-etc)    23  0.047%                           
## 99                        ... Peru    46  0.094%                           
## 100                ... Philippines   295  0.604%                           
## 101                     ... Poland    87  0.178%                           
## 102                   ... Portugal    67  0.137%                           
## 103                ... Puerto-Rico   184  0.377%                           
## 104                   ... Scotland    21  0.043%                           
## 105                      ... South   115  0.235%                           
## 106                     ... Taiwan    65  0.133%                           
## 107                   ... Thailand    30  0.061%                           
## 108            ... Trinadad&Tobago    27  0.055%                           
## 109              ... United-States 43832 89.742%                           
## 110                    ... Vietnam    86  0.176%                           
## 111                 ... Yugoslavia    23  0.047%                           
## 112                         Income 48842                                   
## 113                ... Earns <=50k 37155 76.072%                           
## 114                 ... Earns >50k 11687 23.928%                           
##        P75     Max NA Mode
## 1       48      90  0     
## 2                         
## 3                         
## 4                         
## 5                         
## 6                         
## 7                         
## 8                         
## 9   237642 1490400  0     
## 10                        
## 11                        
## 12                        
## 13                        
## 14                        
## 15                        
## 16                        
## 17                        
## 18                        
## 19                        
## 20                        
## 21                        
## 22                        
## 23                        
## 24                        
## 25                        
## 26                        
## 27      12      16  0     
## 28                        
## 29                        
## 30                        
## 31                        
## 32                        
## 33                        
## 34                        
## 35                        
## 36                        
## 37                        
## 38                        
## 39                        
## 40                        
## 41                        
## 42                        
## 43                        
## 44                        
## 45                        
## 46                        
## 47                        
## 48                        
## 49                        
## 50                        
## 51                        
## 52                        
## 53                        
## 54                        
## 55                        
## 56                        
## 57                        
## 58                        
## 59                        
## 60                        
## 61                        
## 62                        
## 63                        
## 64                        
## 65                        
## 66       0   99999  0     
## 67       0    4356  0     
## 68      45      99  0     
## 69                        
## 70                        
## 71                        
## 72                        
## 73                        
## 74                        
## 75                        
## 76                        
## 77                        
## 78                        
## 79                        
## 80                        
## 81                        
## 82                        
## 83                        
## 84                        
## 85                        
## 86                        
## 87                        
## 88                        
## 89                        
## 90                        
## 91                        
## 92                        
## 93                        
## 94                        
## 95                        
## 96                        
## 97                        
## 98                        
## 99                        
## 100                       
## 101                       
## 102                       
## 103                       
## 104                       
## 105                       
## 106                       
## 107                       
## 108                       
## 109                       
## 110                       
## 111                       
## 112                       
## 113                       
## 114

2.4 Standardise Numeric Variables and Split Data

# Standardise Numeric Variables
Adults$Age.z       <- (Adults$Age - mean(Adults$Age))/sd(Adults$Age)
Adults$EduNum.z    <- (Adults$EduNum - mean(Adults$EduNum))/sd(Adults$EduNum)
Adults$CapGain.z   <- (Adults$CapGain - mean(Adults$CapGain))/sd(Adults$CapGain)
Adults$CapLoss.z   <- (Adults$CapLoss - mean(Adults$CapLoss))/sd(Adults$CapLoss)
Adults$WeekHours.z <- (Adults$WeekHours - mean(Adults$WeekHours))/sd(Adults$WeekHours)
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"      
## [16] "Age.z"        "EduNum.z"     "CapGain.z"    "CapLoss.z"    "WeekHours.z"
# Load Library if not available (initial_split/training/testing come from rsample)
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)

# Split data into train and test
set.seed(123)
split <- initial_split(Adults, prop = 0.80)
AdultsTrain <- training(split)
AdultsTest  <- testing(split)

2.5 Use Predictors to Classify Whether or not a Person’s Income is at most $50k

# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)
if(! "rpart.plot" %in% installed.packages()) { install.packages("rpart.plot", dependencies = TRUE) }
library(rpart.plot)

# Calculate the Income Tree
IncomeTree <- rpart(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "class")
print(IncomeTree)
## n= 39073 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 39073 9401 Earns <=50k (0.75940 0.24060)  
##    2) Marital=Divorced,Never-married,Separated,Widowed 20577 1311 Earns <=50k (0.93629 0.06371)  
##      4) CapGain.z< 0.802 20208  957 Earns <=50k (0.95264 0.04736) *
##      5) CapGain.z>=0.802 369   15 Earns >50k (0.04065 0.95935) *
##    3) Marital=Married 18496 8090 Earns <=50k (0.56261 0.43739)  
##      6) EduNum.z< 0.942 13047 4235 Earns <=50k (0.67540 0.32460)  
##       12) CapGain.z< 0.539 12408 3608 Earns <=50k (0.70922 0.29078)  
##         24) EduNum.z< -0.6138 2155  222 Earns <=50k (0.89698 0.10302) *
##         25) EduNum.z>=-0.6138 10253 3386 Earns <=50k (0.66976 0.33024)  
##           50) CapLoss.z< 4.363 9842 3076 Earns <=50k (0.68746 0.31254) *
##           51) CapLoss.z>=4.363 411  101 Earns >50k (0.24574 0.75426) *
##       13) CapGain.z>=0.539 639   12 Earns >50k (0.01878 0.98122) *
##      7) EduNum.z>=0.942 5449 1594 Earns >50k (0.29253 0.70747) *

2.6 Plot Decision Tree using Cart

# Plot the Decision Tree
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn More than 50k?", 
           under = FALSE,  
           type = 2,
           extra = 106,
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,
           fallen.leaves = TRUE, 
           digits = 3,
           box.palette = "RdGn"
           )

2.7 Check the Order of Variable Importance

# Checking the order of variable importance
IncomeTree$variable.importance
##     Marital   CapGain.z    EduNum.z      Gender       Age.z WeekHours.z 
##      2720.3      1444.1      1310.8       928.9       745.8       399.3 
##       Class   CapLoss.z 
##       255.6       165.7
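
The raw importance scores are easier to compare when expressed as a share of the total importance (a small sketch using the object above):

# Express variable importance as a percentage of the total importance
round(100 * IncomeTree$variable.importance / sum(IncomeTree$variable.importance), 1)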

2.8 Predict Using Classification Tree

# Calculate Predicted Values
AdultsTest$Pred = predict(IncomeTree, AdultsTest, type = "class")
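
If class probabilities are needed later (for example, to draw an ROC curve), predict() with type = "prob" returns one column per class; a minimal sketch, assuming the class labels shown in the tree output above:

# Sketch: predicted probability of the "Earns >50k" class for the test data
ProbPred <- predict(IncomeTree, AdultsTest, type = "prob")[, "Earns >50k"]
head(ProbPred)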

2.9 Plot Model Accuracy vs Different Values of cp (Complexity Parameter)

# Fit the model on the training set
set.seed(123)
IncomeTree <- train(
  Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "rpart", trControl = trainControl("cv", number = 10), tuneLength = 10
  )

# Plot model accuracy vs different values of cp (complexity parameter)
plot(IncomeTree,
     main = "Model Accuracy vs Different Values of cp (Model Complexity)", 
     pch = 16, 
     col = "blue"
)

cat("\n\nThe complexity parameter (cp) plays a crucial role in controlling the size of the decision tree.\n")
## 
## 
## The complexity parameter (cp) plays a crucial role in controlling the size of the decision tree.
cat("A smaller cp value results in a more complex tree with more splits. A larger cp value results in a simpler tree with fewer splits.\n\n")
## A smaller cp value results in a more complex tree with more splits. A larger cp value results in a simpler tree with fewer splits.
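
To see where the cross-validation settled, the tuned cp value and its estimated accuracy can be read directly off the caret object (a minimal sketch, assuming the train() call above completed):

# Inspect the cp value chosen by cross-validation and its estimated accuracy
IncomeTree$bestTune
IncomeTree$results[which.max(IncomeTree$results$Accuracy), c("cp", "Accuracy", "Kappa")]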

2.11 Plot the Most Accurate Model

# Plot the final tree model
# par(xpd = NA) # Avoid clipping the text in some device
# plot(IncomeTree$finalModel)
# text(IncomeTree$finalModel,  digits = 3)

IncomeTree <- rpart(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "class")

# Decision rules in the model
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn More than 50k?", 
           under = FALSE,  
           type = 2,
           extra = 106,
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,
           fallen.leaves = TRUE, 
           digits = 3,
           box.palette = "RdGn"
           )

2.12 Evaluate the Performance of the Classification Tree

# Evaluate the Performance of the Classification Tree
Predicted <- AdultsTest$Pred
Actual <- AdultsTest$Income
table(Predicted, Actual)
##              Actual
## Predicted     Earns <=50k Earns >50k
##   Earns <=50k        7076       1048
##   Earns >50k          407       1238
# Compute model accuracy rate on test data
ModAcc <- mean(Predicted == Actual)
cat("\n\nThe overall accuracy of our tree model is", ModAcc)
## 
## 
## The overall accuracy of our tree model is 0.8511
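
Accuracy alone can be misleading when the classes are imbalanced (about 76% earn at most 50k), so a fuller breakdown via caret's confusionMatrix() may be useful; a sketch, treating "Earns >50k" as the positive class:

# Sketch: class-level performance of the income tree on the test data
confusionMatrix(Predicted, factor(Actual, levels = levels(Predicted)), positive = "Earns >50k")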

2.13 Check Whether Pruning Reduces the Error Rate

# Check Whether Pruning Reduces the Error Rate
printcp(IncomeTree)
## 
## Classification tree:
## rpart(formula = Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + 
##     WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, 
##     method = "class")
## 
## Variables actually used in tree construction:
## [1] CapGain.z CapLoss.z EduNum.z  Marital  
## 
## Root node error: 9401/39073 = 0.24
## 
## n= 39073 
## 
##      CP nsplit rel error xerror   xstd
## 1 0.120      0      1.00   1.00 0.0090
## 2 0.065      2      0.76   0.76 0.0081
## 3 0.036      3      0.69   0.69 0.0078
## 4 0.011      4      0.66   0.66 0.0077
## 5 0.010      6      0.64   0.64 0.0076
# Select the cp value with the lowest cross-validated error (xerror)
LowCP <- IncomeTree$cptable[which.min(IncomeTree$cptable[,"xerror"]),"CP"]
cat("\n\nThe Complexity Parameter (CP) with the lowest cross-validated error is", LowCP, ".\n")
## 
## 
## The Complexity Parameter (CP) with the lowest cross-validated error is 0.01 .

2.14 Apply the Pruned Tree with the Lowest-Error cp Value

# Prune the tree using the cp value with the lowest cross-validated error and plot the result
bestcp <- IncomeTree$cptable[which.min(IncomeTree$cptable[,"xerror"]),"CP"]
PrunedTree <- prune(IncomeTree, cp = bestcp)
rpart.plot(PrunedTree, main = "Classification Tree: Is a Person Likely to Earn More than 50k?", 
           under = FALSE,  
           type = 2,
           extra = 106,
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,
           fallen.leaves = TRUE, 
           digits = 3,
           box.palette = "RdGn"
           )

2.15 Use the Pruned Tree for Prediction

# Alternate specification 
AdultsTest$PrunedPred = predict(PrunedTree, AdultsTest, type = "class")

# Evaluate the Performance of the Pruned Tree
PrunedPred <- AdultsTest$PrunedPred
table(PrunedPred, Actual)
##              Actual
## PrunedPred    Earns <=50k Earns >50k
##   Earns <=50k        7076       1048
##   Earns >50k          407       1238
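
Because the selected cp of 0.01 is also rpart's default, the pruned tree is identical to the original model, which is why this confusion matrix matches the one in Section 2.12; its accuracy can be confirmed the same way (a small sketch):

# Compute the pruned model's accuracy rate on the test data
PrunedAcc <- mean(PrunedPred == Actual)
cat("\n\nThe overall accuracy of the pruned tree model is", PrunedAcc)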

3 Model Evaluation Techniques for Classification and Regression Trees

3.1 Get Data for Wine Quality in Portugal

# Download Wine Data from URL
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"             
## [13] "type"

3.2 Prepare Data

# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

# Prepare Column Names
colnames(Wines)[1]  <- "FixAcid"
colnames(Wines)[2]  <- "VolAcid"
colnames(Wines)[3]  <- "CitricAcid"
colnames(Wines)[4]  <- "ResSugar"
colnames(Wines)[5]  <- "Chlorid"
colnames(Wines)[6]  <- "FreeSulf"
colnames(Wines)[7]  <- "TotalSulf"
colnames(Wines)[8]  <- "Density"
colnames(Wines)[9]  <- "PH"
colnames(Wines)[10] <- "Sulphates"
colnames(Wines)[11] <- "Alcohol"
colnames(Wines)[12] <- "Quality"
colnames(Wines)[13] <- "Type"

# Add ID Column
Wines$ID <- 1:nrow(Wines)

# Show 4 Rows of Dataset
head(Wines, 4)
##   FixAcid VolAcid CitricAcid ResSugar Chlorid FreeSulf TotalSulf Density   PH
## 1     7.0    0.27       0.36     20.7   0.045       45       170  1.0010 3.00
## 2     6.3    0.30       0.34      1.6   0.049       14       132  0.9940 3.30
## 3     8.1    0.28       0.40      6.9   0.050       30        97  0.9951 3.26
## 4     7.2    0.23       0.32      8.5   0.058       47       186  0.9956 3.19
##   Sulphates Alcohol Quality  Type ID
## 1      0.45     8.8       6 White  1
## 2      0.49     9.5       6 White  2
## 3      0.44    10.1       6 White  3
## 4      0.40     9.9       6 White  4
# Write data to working directory
write.csv(Wines, file = "Wines.csv") 

3.3 Explore Data Frame

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Wines)
##  [1] "FixAcid"    "VolAcid"    "CitricAcid" "ResSugar"   "Chlorid"   
##  [6] "FreeSulf"   "TotalSulf"  "Density"    "PH"         "Sulphates" 
## [11] "Alcohol"    "Quality"    "Type"       "ID"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Wines)
## 'data.frame':    6497 obs. of  14 variables:
##  $ FixAcid   : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ VolAcid   : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ CitricAcid: num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ ResSugar  : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ Chlorid   : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ FreeSulf  : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ TotalSulf : num  170 132 97 186 186 97 136 170 132 129 ...
##  $ Density   : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ PH        : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ Sulphates : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ Alcohol   : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ Quality   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Type      : chr  "White" "White" "White" "White" ...
##  $ ID        : int  1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Wines, 5)
##   FixAcid VolAcid CitricAcid ResSugar Chlorid FreeSulf TotalSulf Density   PH
## 1     7.0    0.27       0.36     20.7   0.045       45       170  1.0010 3.00
## 2     6.3    0.30       0.34      1.6   0.049       14       132  0.9940 3.30
## 3     8.1    0.28       0.40      6.9   0.050       30        97  0.9951 3.26
## 4     7.2    0.23       0.32      8.5   0.058       47       186  0.9956 3.19
## 5     7.2    0.23       0.32      8.5   0.058       47       186  0.9956 3.19
##   Sulphates Alcohol Quality  Type ID
## 1      0.45     8.8       6 White  1
## 2      0.49     9.5       6 White  2
## 3      0.44    10.1       6 White  3
## 4      0.40     9.9       6 White  4
## 5      0.40     9.9       6 White  5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(Wines, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##      Variable    N     Mean        SD     Min     P25     P50     P75   Max NA
## 1     FixAcid 6497   7.2153    1.2964     3.8     6.4       7     7.7  15.9  0
## 2     VolAcid 6497  0.33967   0.16464    0.08    0.23    0.29     0.4  1.58  0
## 3  CitricAcid 6497  0.31863   0.14532       0    0.25    0.31    0.39  1.66  0
## 4    ResSugar 6497   5.4432    4.7578     0.6     1.8       3     8.1  65.8  0
## 5     Chlorid 6497 0.056034  0.035034   0.009   0.038   0.047   0.065 0.611  0
## 6    FreeSulf 6497   30.525    17.749       1      17      29      41   289  0
## 7   TotalSulf 6497   115.74    56.522       6      77     118     156   440  0
## 8     Density 6497   0.9947 0.0029987 0.98711 0.99234 0.99489 0.99699 1.039  0
## 9          PH 6497   3.2185   0.16079    2.72    3.11    3.21    3.32  4.01  0
## 10  Sulphates 6497  0.53127   0.14881    0.22    0.43    0.51     0.6     2  0
## 11    Alcohol 6497   10.492    1.1927       8     9.5    10.3    11.3  14.9  0
## 12    Quality 6497   5.8184   0.87326       3       5       6       6     9  0
## 13       Type 6497                                                            
## 14    ... Red 1599  24.611%                                                   
## 15  ... White 4898  75.389%                                                   
## 16         ID 6497     3249    1875.7       1    1625    3249    4873  6497  0
##    Mode
## 1      
## 2      
## 3      
## 4      
## 5      
## 6      
## 7      
## 8      
## 9      
## 10     
## 11     
## 12     
## 13     
## 14     
## 15     
## 16
RedMean   <- mean(Wines$Quality[Wines$Type == "Red"])
WhiteMean <- mean(Wines$Quality[Wines$Type == "White"])

3.4 Review Variables Before Modelling

# Standardisation is not required for the tree-based models used below, so the
# numeric wine measurements are kept on their original scale. The train/test
# split is performed in Section 3.7, after the result variable has been collapsed.
names(Wines)
##  [1] "FixAcid"    "VolAcid"    "CitricAcid" "ResSugar"   "Chlorid"   
##  [6] "FreeSulf"   "TotalSulf"  "Density"    "PH"         "Sulphates" 
## [11] "Alcohol"    "Quality"    "Type"       "ID"
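
For reference, if standardised versions of the wine measurements were wanted later (for example, for a distance-based method), the z-score construction from Section 2.4 could be reused; a small sketch for two columns, kept in a copy so the column positions of Wines used below are unchanged:

# Sketch: z-score standardisation of selected wine measurements (illustrative only)
WinesZ <- Wines
WinesZ$Alcohol.z <- (WinesZ$Alcohol - mean(WinesZ$Alcohol))/sd(WinesZ$Alcohol)
WinesZ$VolAcid.z <- (WinesZ$VolAcid - mean(WinesZ$VolAcid))/sd(WinesZ$VolAcid)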

3.5 Plot Result Variable

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")

# Plot Result Variable
ggplot(Wines, 
  aes(factor(Quality))) +
  geom_bar(fill = "#7c0c18") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'none',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 16),
    # axis.text.x=element_blank(), #remove x axis labels
    # axis.ticks.x=element_blank(), #remove x axis ticks
    # axis.text.y=element_blank(),  #remove y axis labels
    #axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Distribution of Wine Quality",
       subtitle = "Based on Real Data from Portugal",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "Quality Score", 
       y = "Count of Wines")

3.6 Check Whether Red and White Wine Quality is Different

# Load Library if not available
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)

# Build BoxPlot
boxplot(Quality ~ Type, data = Wines,
        main = "Wine Quality by Type", 
        xlab = "Type of Wine", 
        ylab = "Quality Score", 
        col = c("darkred", "yellow"), 
        border = "brown",
        horizontal = FALSE,
        notch = TRUE
)

# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)

WinesUnstacked <- Wines %>%
  group_by(Type, Quality) %>%
  summarise(Count = n(), .groups = 'drop')

# Build BarPlot
barplot(
    # Plot data
    Count ~ Type + Quality, data = WinesUnstacked, beside = TRUE,
    # Axis labels
    xlab = "Type of Wine and Quality Score",
    ylab = "Count of Wines",
    ylim = c(0, 2500),
    axes = TRUE, 
    # Graph title
    main = "Wine Quality by Type",
    sub = "Based on Real Data from Portugal",
    # Add colour
    col = c("darkred", "beige")
    # A legend is added separately below
) 
# Add labels with some offset to be above the bar
# text(x = , y = WinesUnstacked$Count,labels = round(WinesUnstacked$Count, digits = 2))
# Add Legend
legend("topleft", fill = c("darkred", "beige"), legend = c("Red Wine", "White Wine"), 
    horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")
# Draw a box outline
box()
# Draw Gridlines
grid(NA, 5,
     lty = 3,             # Grid line type
     col = "lightgray",   # Grid line color
     lwd = 2)             # Grid line width

# Check Normality
cat("\n\nNormality Test for Red Wine: ")
## 
## 
## Normality Test for Red Wine:
spvalue1 <- shapiro.test(subset(Wines, Type == "Red")$Quality)$p.value
if (spvalue1 < 0.05) { 
       cat("\nThe Red Wine Quality score is not normally distributed.") 
  } else {
       cat("\nThe Red Wine Quality score is normally distributed.") 
  }
## 
## The Red Wine Quality score is not normally distributed.
cat("\n\nNormality Test for White Wine: ")
## 
## 
## Normality Test for White Wine:
spvalue2 <- shapiro.test(subset(Wines, Type == "White")$Quality)$p.value
if (spvalue2 < 0.05) { 
       cat("\nThe White Wine Quality score is not normally distributed.") 
  } else {
       cat("\nThe White Wine Quality score is normally distributed.") 
  }
## 
## The White Wine Quality score is not normally distributed.
if (spvalue1 < 0.05 | spvalue2 < 0.05) {

  # Perform Test on Difference
  testOnDiff <- "Wilcoxon Rank Sum Test"
  wtest    <- wilcox.test(Wines$Quality ~ Wines$Type)
  pvalue   <- round(wtest$p.value, 4)

} else {
  
  # Perform Test on Difference
  testOnDiff <- "Student's t-Test"
  ttest    <- t.test(Quality ~ Type, data = Wines)
  pvalue   <- round(ttest$p.value, 4)
  diff1    <- abs(ttest$conf.int[1])
  diff2    <- abs(ttest$conf.int[2])
  
  if (diff1 > diff2) {
    diff0 <- diff2
    diff2 <- diff1
    diff1 <- diff0
  }
}


cat("\n\nBased on", testOnDiff, "the p-value for a difference between the Quality of Red and White Wine is:", pvalue, ". ")
## 
## 
## Based on Wilcoxon Rank Sum Test the p-value for a difference between the Quality of Red and White Wine is: 0 .
if (pvalue < 0.05) { 
       cat("This means there is a significant difference between the Quality of Red and White Wine.") 
       cat("\n\nThe average Quality rating for White Wine is", WhiteMean, "and for Red Wine is" , RedMean, ".") 
       if (spvalue1 > 0.05 & spvalue2 > 0.05) {
         cat("\nThe difference is between", round(diff1, 4), "and", round(diff2, 4), ".") 
       }
  } else {
       cat("This means there is no significant difference between the Quality of Red and White Wine.")
  }
## This means there is a significant difference between the Quality of Red and White Wine.
## 
## The average Quality rating for White Wine is 5.878 and for Red Wine is 5.636 .

3.7 Collapse and Plot Result Variable

# Collapse the sparse Quality scores 3 and 9 into the neighbouring scores 4 and 8
Wines$QualityFull <- Wines$Quality
Wines$QualityN    <- case_when(Wines$Quality == 3 ~ 4L,
                            Wines$Quality == 9 ~ 8L,
                           !Wines$Quality %in% c(3,9) ~ Wines$Quality)
Wines$Quality <- as.factor(Wines$Quality)
# Drop the helper column QualityFull (column 15)
Wines <- as.data.frame(Wines[, -15])

names(Wines)
##  [1] "FixAcid"    "VolAcid"    "CitricAcid" "ResSugar"   "Chlorid"   
##  [6] "FreeSulf"   "TotalSulf"  "Density"    "PH"         "Sulphates" 
## [11] "Alcohol"    "Quality"    "Type"       "ID"         "QualityN"
# Plot Result Variable
ggplot(Wines, 
  aes(factor(QualityN))) +
  geom_bar(fill = "darkred") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    legend.position = 'none',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 16),
    # axis.text.x=element_blank(), #remove x axis labels
    # axis.ticks.x=element_blank(), #remove x axis ticks
    # axis.text.y=element_blank(),  #remove y axis labels
    #axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Distribution of Wine Quality after Collapsing 3 and 9",
       subtitle = "Based on Real Data from Portugal",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "Quality Score", 
       y = "Count of Wines")

# Split data into train and test
set.seed(123)
split <- initial_split(Wines, prop = 0.80)
WinesTrain <- training(split)
WinesTest  <- testing(split)

3.8 Build C5.0 Decision Tree

# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)

# Build C5.0 Decision Tree
WineTree <- C5.0(Quality ~ ., 
                 data = WinesTrain[, -15])

WineTree
## 
## Call:
## C5.0.formula(formula = Quality ~ ., data = WinesTrain[, -15])
## 
## Classification Tree
## Number of samples: 5197 
## Number of predictors: 13 
## 
## Tree size: 635 
## 
## Non-standard options: attempt to group attributes
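
Note that the row identifier ID is still among the predictors here and appears in the variable-importance tables in Section 3.11; a variant that excludes both ID (column 14) and QualityN (column 15) could be fitted as follows (a sketch, not used further below):

# Sketch: C5.0 tree using only the chemical measurements and Type as predictors
WineTreeNoID <- C5.0(Quality ~ ., data = WinesTrain[, -c(14, 15)])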

3.9 Winnowing and Boosting Tree

WineTreeW <- C5.0(Quality ~ ., data = WinesTrain[, -15], 
                       winnow = TRUE, 
                       trials = 5)

WineTreeW
## 
## Call:
## C5.0.formula(formula = Quality ~ ., data = WinesTrain[, -15], winnow =
##  TRUE, trials = 5)
## 
## Classification Tree
## Number of samples: 5197 
## Number of predictors: 13 
## 
## Number of boosting iterations: 5 
## Average tree size: 536.4 
## 
## Non-standard options: attempt to group attributes

3.10 Comparing Performance

# Load Library if not available
if(! "yardstick" %in% installed.packages()) { install.packages("yardstick", dependencies = TRUE) }
library(yardstick)

class_wines <- tibble(value = WinesTrain$Quality, 
                      predict   = predict(WineTree, WinesTrain),
                      predictw = predict(WineTreeW, WinesTrain))

class_metrics <- metric_set(accuracy, precision, recall)

# Show confusion matrix
cat("\n\nShow Confusion Matrix:\n")
## 
## 
## Show Confusion Matrix:
conf_mat(class_wines, truth = value, estimate = predict)
##           Truth
## Prediction    3    4    5    6    7    8    9
##          3    8    2    1    0    0    0    0
##          4    2  111    8    5    7    1    0
##          5    3   26 1482  155   31    5    0
##          6    6   22  217 2048  112   21    1
##          7    3    4   12   42  716   27    2
##          8    0    0    2    5    6  103    1
##          9    0    0    0    0    0    0    0
cat("\n\nShow Performance Metrics of Original Model:\n")
## 
## 
## Show Performance Metrics of Original Model:
class_metrics(class_wines, truth = value, estimate = predict)
## # A tibble: 3 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  multiclass     0.860
## 2 precision macro          0.840
## 3 recall    macro          0.612
cat("\n\nShow Performance Metrics of Winnowing/Boosting Model:\n")
## 
## 
## Show Performance Metrics of Winnowing/Boosting Model:
class_metrics(class_wines, truth = value, estimate = predictw)
## # A tibble: 3 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  multiclass     0.975
## 2 precision macro          0.982
## 3 recall    macro          0.934
# Save Predictions in WinesTrain
WinesTrain$Pred  <- predict(WineTree, WinesTrain)
WinesTrain$PredW <- predict(WineTreeW, WinesTrain)

# Write data to working directory
write.csv(WinesTrain, file = "WinesTrain.csv") 

3.11 Show Variable Importance

# Show Variable Importance
cat("\n\nShow Variable Importance for Original Model:\n")
## 
## 
## Show Variable Importance for Original Model:
C5imp(WineTree)
##            Overall
## Alcohol     100.00
## VolAcid      84.68
## FreeSulf     77.83
## FixAcid      71.41
## Sulphates    65.79
## PH           63.40
## CitricAcid   53.43
## Chlorid      51.86
## ResSugar     43.06
## ID           40.35
## Density      38.10
## TotalSulf    37.68
## Type         21.95
cat("\n\nShow Variable Importance for Model after Winnowing/Boosting:\n")
## 
## 
## Show Variable Importance for Model after Winnowing/Boosting:
C5imp(WineTreeW)
##            Overall
## VolAcid     100.00
## Alcohol     100.00
## Sulphates    99.31
## PH           98.85
## TotalSulf    98.06
## FreeSulf     97.58
## Chlorid      97.54
## FixAcid      97.38
## CitricAcid   96.06
## ID           93.71
## ResSugar     93.53
## Density      87.20
## Type         63.21

3.12 Calculate MSE

# Calculate the MSE for the original tree
mse <- mean((as.numeric(as.character(WinesTrain$Pred)) - as.numeric(as.character(WinesTrain$Quality)))^2)
cat("\n\nMSE of the Original Tree is", mse)
## 
## 
## MSE of the Original Tree is 0.2492
# Calculate the MSE for the winnowed/boosted tree
mse <- mean((as.numeric(as.character(WinesTrain$PredW)) - as.numeric(as.character(WinesTrain$Quality)))^2)
cat("\n\nMSE of the Winnowed/Boosted Tree is", mse)
## 
## 
## MSE of the Winnowed/Boosted Tree is 0.04175

3.13 Extract Importance Data from Tree

# Extract Importance Data from Tree
C5TreeImp <- as.data.frame(C5imp(WineTree, metric = "usage"))
VarImp <- as.data.frame(C5TreeImp$Overall)
VarImp$Variable   <- rownames(C5TreeImp)
colnames(VarImp)[1] <- "Importance"


C5TreeImpW <- as.data.frame(C5imp(WineTreeW, metric = "usage"))
VarImpW <- as.data.frame(C5TreeImpW$Overall)
VarImpW$Variable <- rownames(C5TreeImpW)
colnames(VarImpW)[1] <- "Importance"

3.14 Show Column Chart of Importance of Factors for Original Tree

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Create the Column Chart
ggplot(VarImp, aes(y = reorder(Variable, Importance), x = Importance, label = Variable, fill = reorder(Variable, Importance))) +
  geom_col(fill = "#7c0c18") +
  geom_text(
    aes(label = paste(round(Importance, 0), "%")), 
    hjust = 1, nudge_x = -.5,
    size = 5, fontface = "bold", family = "Fira Sans", color = "white") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    plot.margin = margin(rep(15, 4)),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "lightgrey")
  ) +
  labs(title = "Importance of Model Variables",
       subtitle = "Based on Original Classification Tree of Wine Quality Data",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

3.15 Show Column Chart of Importance of Factors for Tree after Winnowing/Boosting Tree

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Create the Column Chart
ggplot(VarImpW, aes(y = reorder(Variable, Importance), x = Importance, label = Variable, fill = reorder(Variable, Importance))) +
  geom_col(fill = "#7c0c18") +
  geom_text(
    aes(label = paste(round(Importance, 0), "%")), 
    hjust = 1, nudge_x = -.5,
    size = 5, fontface = "bold", family = "Fira Sans", color = "white") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    plot.margin = margin(rep(15, 4)),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "lightgrey")
  ) +
  labs(title = "Importance of Model Variables After Winnowing/Boosting",
       subtitle = "Based on Tree after Winnowing/Boosting",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

3.16 Showing Performance of Test Data

# Save Predictions in WinesTest
WinesTest$Pred  <- predict(WineTree, WinesTest)

# Performance Indicators for Original Tree
# Calculate the MSE for the original tree
mse <- mean((as.numeric(as.character(WinesTest$Pred)) - as.numeric(as.character(WinesTest$Quality)))^2)
cat("\n\nMSE of the Original Tree is", mse)
## 
## 
## MSE of the Original Tree is 0.7192
cat("\nRMSE of the Original Tree is", sqrt(mse), "\n\n")
## 
## RMSE of the Original Tree is 0.8481
cat("\n\nThe Confusion Matrix explains the following details:\n")
## 
## 
## The Confusion Matrix explains the following details:
cat("- Accuracy: The proportion of true results (both true positives and true negatives) among the total number of cases examined.\n")
## - Accuracy: The proportion of true results (both true positives and true negatives) among the total number of cases examined.
cat("- Kappa: A statistic that measures inter-rater agreement for categorical items. It is generally thought to be a more robust measure than simple percent agreement calculation since it takes into account the agreement occurring by chance.\n")
## - Kappa: A statistic that measures inter-rater agreement for categorical items. It is generally thought to be a more robust measure than simple percent agreement calculation since it takes into account the agreement occurring by chance.
cat("- No Information Rate (NIR): The accuracy that could be achieved by always predicting the most frequent class.\n")
## - No Information Rate (NIR): The accuracy that could be achieved by always predicting the most frequent class.
cat("- Pos Pred Value (Precision): The proportion of predicted positive results that are true positives.\n")
## - Pos Pred Value (Precision): The proportion of predicted positive results that are true positives.
cat("- Neg Pred Value: The proportion of predicted negative results that are true negatives.\n")
## - Neg Pred Value: The proportion of predicted negative results that are true negatives.
cat("- Prevalence: The proportion of the dataset that is positive.\n")
## - Prevalence: The proportion of the dataset that is positive.
cat("- Detection Rate: The proportion of the whole sample that are correctly detected positives (true positives divided by the total).\n")
## - Detection Rate: The proportion of the whole sample that are correctly detected positives (true positives divided by the total).
cat("- Detection Prevalence: The proportion of predicted positives.\n")
## - Detection Prevalence: The proportion of predicted positives.
cat("- Balanced Accuracy: The average of sensitivity and specificity.\n\n")
## - Balanced Accuracy: The average of sensitivity and specificity.
confMat <- confusionMatrix((WinesTest$Pred), (WinesTest$Quality))
confMat 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   2   9  11  10   3   0   0
##          5   4  21 233 133  12   1   1
##          6   1  17 158 375  85  18   0
##          7   1   3  12  60  98  14   0
##          8   0   1   2   3   9   3   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                             
##                Accuracy : 0.552             
##                  95% CI : (0.525, 0.58)     
##     No Information Rate : 0.447             
##     P-Value [Acc > NIR] : 0.0000000000000164
##                                             
##                   Kappa : 0.312             
##                                             
##  Mcnemar's Test P-Value : NA                
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000  0.17647    0.560    0.645   0.4734  0.08333
## Specificity           1.00000  0.97918    0.805    0.612   0.9177  0.98813
## Pos Pred Value            NaN  0.25714    0.575    0.573   0.5213  0.16667
## Neg Pred Value        0.99385  0.96680    0.796    0.681   0.9020  0.97426
## Prevalence            0.00615  0.03923    0.320    0.447   0.1592  0.02769
## Detection Rate        0.00000  0.00692    0.179    0.288   0.0754  0.00231
## Detection Prevalence  0.00000  0.02692    0.312    0.503   0.1446  0.01385
## Balanced Accuracy     0.50000  0.57783    0.683    0.629   0.6955  0.53573
##                      Class: 9
## Sensitivity          0.000000
## Specificity          1.000000
## Pos Pred Value            NaN
## Neg Pred Value       0.999231
## Prevalence           0.000769
## Detection Rate       0.000000
## Detection Prevalence 0.000000
## Balanced Accuracy    0.500000
Sensitivity <- confMat$byClass[, 'Sensitivity']
Specificity <- confMat$byClass[, 'Specificity']
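
For multiclass results, byClass is a matrix with one row per class, so the per-class measures can be pulled out and tabulated directly (a small sketch):

# Sketch: per-class sensitivity, specificity and balanced accuracy from the confusion matrix
round(confMat$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 3)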

3.17 Showing Performance of Test Data After Winnowing and Boosting

# Save Predictions in WinesTest
WinesTest$PredW <- predict(WineTreeW, WinesTest)

# Performance Indicators for Winnowed/Boosted Tree
# Calculate the MSE for the Winnowed/Boosted tree
mse <- mean((as.numeric(as.character(WinesTest$PredW)) - as.numeric(as.character(WinesTest$Quality)))^2)
cat("\n\nMSE of the Winnowed/Boosted Tree is", mse)
## 
## 
## MSE of the Winnowed/Boosted Tree is 0.5523
cat("\nRMSE of the Winnowed/Boosted Tree is", sqrt(mse), "\n\n")
## 
## RMSE of the Winnowed/Boosted Tree is 0.7432
confMat <- confusionMatrix((WinesTest$PredW), (WinesTest$Quality))
confMat 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   3   9  10   2   0   0   0
##          5   4  27 264 113   6   0   1
##          6   0  12 131 415  92  14   0
##          7   1   2  10  48 103   9   0
##          8   0   1   1   3   6  13   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                              
##                Accuracy : 0.618              
##                  95% CI : (0.591, 0.645)     
##     No Information Rate : 0.447              
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.41               
##                                              
##  Mcnemar's Test P-Value : NA                 
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000  0.17647    0.635    0.714   0.4976   0.3611
## Specificity           1.00000  0.98799    0.829    0.654   0.9360   0.9913
## Pos Pred Value            NaN  0.37500    0.636    0.625   0.5954   0.5417
## Neg Pred Value        0.99385  0.96708    0.828    0.739   0.9077   0.9820
## Prevalence            0.00615  0.03923    0.320    0.447   0.1592   0.0277
## Detection Rate        0.00000  0.00692    0.203    0.319   0.0792   0.0100
## Detection Prevalence  0.00000  0.01846    0.319    0.511   0.1331   0.0185
## Balanced Accuracy     0.50000  0.58223    0.732    0.684   0.7168   0.6762
##                      Class: 9
## Sensitivity          0.000000
## Specificity          1.000000
## Pos Pred Value            NaN
## Neg Pred Value       0.999231
## Prevalence           0.000769
## Detection Rate       0.000000
## Detection Prevalence 0.000000
## Balanced Accuracy    0.500000
Sensitivity <- confMat$byClass[, 'Sensitivity']
Specificity <- confMat$byClass[, 'Specificity']

4 Comparing Prediction Performance of Linear Model and CART Model

4.1 Calculate Linear Regression Model for all Wines

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(as.character(Wines$Quality))

# Run Linear Regression with all Xs
LinReg09 <- lm(QualityNum ~ FixAcid + VolAcid + CitricAcid + ResSugar + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol + Type, data = Wines)

summary(LinReg09)
## 
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + CitricAcid + ResSugar + 
##     Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + 
##     Alcohol + Type, data = Wines)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.780 -0.467 -0.044  0.456  3.021 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  104.751754   14.135540    7.41     0.00000000000014 ***
## FixAcid        0.085066    0.015764    5.40     0.00000007046421 ***
## VolAcid       -1.492406    0.081351  -18.35 < 0.0000000000000002 ***
## CitricAcid    -0.062621    0.079720   -0.79                0.432    
## ResSugar       0.062437    0.005934   10.52 < 0.0000000000000002 ***
## Chlorid       -0.757251    0.334445   -2.26                0.024 *  
## FreeSulf       0.004937    0.000766    6.44     0.00000000012535 ***
## TotalSulf     -0.001403    0.000324   -4.33     0.00001490646374 ***
## Density     -103.909624   14.335952   -7.25     0.00000000000047 ***
## PH             0.498756    0.090579    5.51     0.00000003805505 ***
## Sulphates      0.721744    0.076243    9.47 < 0.0000000000000002 ***
## Alcohol        0.222670    0.018074   12.32 < 0.0000000000000002 ***
## TypeWhite     -0.361332    0.056753   -6.37     0.00000000020631 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.733 on 6484 degrees of freedom
## Multiple R-squared:  0.297,  Adjusted R-squared:  0.295 
## F-statistic:  228 on 12 and 6484 DF,  p-value: <0.0000000000000002
anova(LinReg09)
## Analysis of Variance Table
## 
## Response: QualityNum
##              Df Sum Sq Mean Sq F value               Pr(>F)    
## FixAcid       1     29      29   54.29     0.00000000000019 ***
## VolAcid       1    322     322  599.75 < 0.0000000000000002 ***
## CitricAcid    1      0       0    0.65                 0.42    
## ResSugar      1     42      42   78.17 < 0.0000000000000002 ***
## Chlorid       1     63      63  116.73 < 0.0000000000000002 ***
## FreeSulf      1      1       1    2.35                 0.13    
## TotalSulf     1    188     188  349.95 < 0.0000000000000002 ***
## Density       1    339     339  630.09 < 0.0000000000000002 ***
## PH            1    184     184  343.28 < 0.0000000000000002 ***
## Sulphates     1    140     140  261.06 < 0.0000000000000002 ***
## Alcohol       1    138     138  256.38 < 0.0000000000000002 ***
## Type          1     22      22   40.54     0.00000000020631 ***
## Residuals  6484   3485       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(LinReg09)
##    FixAcid    VolAcid CitricAcid   ResSugar    Chlorid   FreeSulf  TotalSulf 
##      5.048      2.168      1.622      9.635      1.659      2.236      4.046 
##    Density         PH  Sulphates    Alcohol       Type 
##     22.337      2.564      1.556      5.617      7.224
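
Since the aim of this part is to compare prediction performance with the tree models, the linear model's in-sample MSE and RMSE can be computed on the same Quality scale (a sketch using the fitted object above):

# Sketch: in-sample MSE and RMSE of the full linear model
mseLin <- mean(residuals(LinReg09)^2)
cat("\n\nMSE of the Linear Model is", mseLin, "\nRMSE of the Linear Model is", sqrt(mseLin), "\n")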

4.2 Calculate Linear Regression Model without ResSugar

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Making a Numeric Quality Column
# Note: as.numeric() on a factor returns the level codes (here 1 to 7, i.e. Quality minus 2),
# unlike as.numeric(as.character()) in Section 4.1; this shifts only the intercept, not the slopes.
Wines$QualityNum <- as.numeric(Wines$Quality)

# Run Linear Regression without ResSugar
LinReg10 <- lm(QualityNum ~ FixAcid + VolAcid + CitricAcid + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol + Type, data = Wines)

summary(LinReg10)
## 
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + CitricAcid + Chlorid + 
##     FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol + 
##     Type, data = Wines)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.010 -0.467 -0.038  0.467  3.154 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -33.502639   5.715140   -5.86   0.0000000047955380 ***
## FixAcid      -0.031785   0.011282   -2.82               0.0049 ** 
## VolAcid      -1.560064   0.081779  -19.08 < 0.0000000000000002 ***
## CitricAcid   -0.085521   0.080362   -1.06               0.2873    
## Chlorid      -1.209213   0.334468   -3.62               0.0003 ***
## FreeSulf      0.006134   0.000764    8.03   0.0000000000000012 ***
## TotalSulf    -0.001679   0.000325   -5.16   0.0000002539355998 ***
## Density      34.590746   5.726875    6.04   0.0000000016257380 ***
## PH           -0.080357   0.072546   -1.11               0.2680    
## Sulphates     0.534056   0.074751    7.14   0.0000000000010023 ***
## Alcohol       0.361938   0.012411   29.16 < 0.0000000000000002 ***
## TypeWhite    -0.062943   0.049574   -1.27               0.2042    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.739 on 6485 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.283 
## F-statistic:  234 on 11 and 6485 DF,  p-value: <0.0000000000000002
anova(LinReg10)
## Analysis of Variance Table
## 
## Response: QualityNum
##              Df Sum Sq Mean Sq F value               Pr(>F)    
## FixAcid       1     29      29   53.38   0.0000000000003076 ***
## VolAcid       1    322     322  589.77 < 0.0000000000000002 ***
## CitricAcid    1      0       0    0.64                 0.42    
## Chlorid       1     56      56  103.36 < 0.0000000000000002 ***
## FreeSulf      1     12      12   21.96   0.0000028340998068 ***
## TotalSulf     1    223     223  408.45 < 0.0000000000000002 ***
## Density       1    182     182  333.84 < 0.0000000000000002 ***
## PH            1     36      36   64.96   0.0000000000000009 ***
## Sulphates     1     80      80  147.05 < 0.0000000000000002 ***
## Alcohol       1    467     467  853.85 < 0.0000000000000002 ***
## Type          1      1       1    1.61                 0.20    
## Residuals  6485   3544       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(LinReg10)
##    FixAcid    VolAcid CitricAcid    Chlorid   FreeSulf  TotalSulf    Density 
##      2.543      2.155      1.621      1.632      2.186      4.019      3.505 
##         PH  Sulphates    Alcohol       Type 
##      1.617      1.471      2.604      5.421
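
Because dropping ResSugar gives a model nested within the one from Section 4.1, and the level-code shift noted above changes only the intercept (the residuals are unaffected), the two fits can be compared with a nested-model F test (a sketch):

# Sketch: F test of the reduced model (without ResSugar) against the full model from 4.1
anova(LinReg10, LinReg09)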

4.3 Calculate Linear Regression Model without CitricAcid and Type

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(Wines$Quality)

# Run Linear Regression, additionally dropping CitricAcid and Type
LinReg11 <- lm(QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol, data = Wines)

summary(LinReg11)
## 
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + 
##     TotalSulf + Density + PH + Sulphates + Alcohol, data = Wines)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.974 -0.466 -0.038  0.463  3.135 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -34.879569   5.523150   -6.32    0.000000000287744 ***
## FixAcid      -0.031637   0.010248   -3.09              0.00203 ** 
## VolAcid      -1.481112   0.067980  -21.79 < 0.0000000000000002 ***
## Chlorid      -1.173701   0.322055   -3.64              0.00027 ***
## FreeSulf      0.006314   0.000754    8.37 < 0.0000000000000002 ***
## TotalSulf    -0.001967   0.000265   -7.42    0.000000000000137 ***
## Density      35.787370   5.567088    6.43    0.000000000138138 ***
## PH           -0.044325   0.068967   -0.64              0.52044    
## Sulphates     0.554740   0.072280    7.67    0.000000000000019 ***
## Alcohol       0.360322   0.012273   29.36 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.739 on 6487 degrees of freedom
## Multiple R-squared:  0.284,  Adjusted R-squared:  0.283 
## F-statistic:  286 on 9 and 6487 DF,  p-value: <0.0000000000000002
anova(LinReg11)
## Analysis of Variance Table
## 
## Response: QualityNum
##             Df Sum Sq Mean Sq F value               Pr(>F)    
## FixAcid      1     29      29    53.4   0.0000000000003088 ***
## VolAcid      1    322     322   589.7 < 0.0000000000000002 ***
## Chlorid      1     57      57   103.8 < 0.0000000000000002 ***
## FreeSulf     1     12      12    21.2   0.0000042706747999 ***
## TotalSulf    1    213     213   389.7 < 0.0000000000000002 ***
## Density      1    187     187   342.8 < 0.0000000000000002 ***
## PH           1     33      33    60.8   0.0000000000000073 ***
## Sulphates    1     83      83   152.2 < 0.0000000000000002 ***
## Alcohol      1    471     471   861.9 < 0.0000000000000002 ***
## Residuals 6487   3546       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
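
Since LinReg11 is nested in LinReg10 (it simply omits CitricAcid and Type), the two models can also be compared directly with a partial F-test; a short sketch, assuming both model objects are still in the workspace:

# Partial F-test: does dropping CitricAcid and Type significantly worsen the fit?
anova(LinReg11, LinReg10)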

4.4 Calculate Reduced Linear Regression Model for all Wines (without CitricAcid, Type and PH)

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(Wines$Quality)

# Run Linear Regression additionally dropping PH
LinReg12 <- lm(QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + TotalSulf + Density + Sulphates + Alcohol, data = Wines)

summary(LinReg12)
## 
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + 
##     TotalSulf + Density + Sulphates + Alcohol, data = Wines)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.992 -0.466 -0.039  0.463  3.131 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -34.121460   5.395477   -6.32    0.000000000271688 ***
## FixAcid      -0.028545   0.009049   -3.15              0.00162 ** 
## VolAcid      -1.488211   0.067074  -22.19 < 0.0000000000000002 ***
## Chlorid      -1.159040   0.321232   -3.61              0.00031 ***
## FreeSulf      0.006310   0.000754    8.37 < 0.0000000000000002 ***
## TotalSulf    -0.001932   0.000260   -7.44    0.000000000000111 ***
## Density      34.877025   5.383638    6.48    0.000000000099540 ***
## Sulphates     0.546221   0.071051    7.69    0.000000000000017 ***
## Alcohol       0.358852   0.012058   29.76 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.739 on 6488 degrees of freedom
## Multiple R-squared:  0.284,  Adjusted R-squared:  0.283 
## F-statistic:  322 on 8 and 6488 DF,  p-value: <0.0000000000000002
anova(LinReg12)
## Analysis of Variance Table
## 
## Response: QualityNum
##             Df Sum Sq Mean Sq F value               Pr(>F)    
## FixAcid      1     29      29    53.4     0.00000000000031 ***
## VolAcid      1    322     322   589.7 < 0.0000000000000002 ***
## Chlorid      1     57      57   103.8 < 0.0000000000000002 ***
## FreeSulf     1     12      12    21.2     0.00000426640941 ***
## TotalSulf    1    213     213   389.8 < 0.0000000000000002 ***
## Density      1    187     187   342.8 < 0.0000000000000002 ***
## Sulphates    1    103     103   188.9 < 0.0000000000000002 ***
## Alcohol      1    484     484   885.7 < 0.0000000000000002 ***
## Residuals 6488   3546       1                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
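
Before committing to LinReg12, it can be useful to put the three candidate models side by side; a hedged sketch comparing adjusted R-squared and AIC (both available from the stats package):

# Compare the full and reduced models on adjusted R-squared and AIC
c(LinReg10 = summary(LinReg10)$adj.r.squared,
  LinReg11 = summary(LinReg11)$adj.r.squared,
  LinReg12 = summary(LinReg12)$adj.r.squared)
AIC(LinReg10, LinReg11, LinReg12)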

4.5 Prepare Data for Pie Chart

# Get the ANOVA table
anova_table <- anova(LinReg12)

# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`

# Prepare data for the pie chart
SSdf <- data.frame(
  Component = rownames(anova_table),
  SumSq = SumSq
)

# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]

# Add Percentage Column (fraction of total) and keep the Cumulative Sum as a separate vector
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)

# Format Percentage Column as Text
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")

# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)

SSdf
##   Component   SumSq Percent                   Labels
## 1 Residuals 3546.12 71.59 % Residuals\n3546\n71.59 %
## 2   Alcohol  484.11  9.77 %     Alcohol\n484\n9.77 %
## 3   VolAcid  322.33  6.51 %     VolAcid\n322\n6.51 %
## 4 TotalSulf  213.04   4.3 %    TotalSulf\n213\n4.3 %
## 5   Density  187.36  3.78 %     Density\n187\n3.78 %
## 6 Sulphates  103.23  2.08 %   Sulphates\n103\n2.08 %
## 7   Chlorid   56.74  1.15 %      Chlorid\n57\n1.15 %
## 8   FixAcid   29.17  0.59 %      FixAcid\n29\n0.59 %
## 9  FreeSulf   11.57  0.23 %     FreeSulf\n12\n0.23 %
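
The same table can be built in one pipeline with dplyr (already loaded above); a sketch, assuming anova_table as created in this section. The result mirrors SSdf and additionally carries a numeric cumulative-percentage column:

# Alternative preparation with dplyr (illustrative)
SSdf_alt <- data.frame(Component = rownames(anova_table),
                       SumSq     = anova_table$`Sum Sq`) %>%
  arrange(desc(SumSq)) %>%
  mutate(Percent = round(100 * SumSq / sum(SumSq), 2),
         CumPerc = cumsum(Percent),
         Labels  = paste(Component, round(SumSq, 0), paste0(Percent, " %"), sep = "\n"))
SSdf_alt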

4.6 Show Pie Chart of Sum of Squares for Regression Model

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )

# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels), fill = alpha("white", 0),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 30 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title = element_blank(),  # remove axis titles ("plot.x" is not a valid theme element)
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Wine Data from Portugal",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

4.7 Show Pie Chart of Sum of Squares for Regression Model Without Residuals

# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )

# Create the pie chart
ggplot(SSdf[-c(1),], aes(x = "", y = SumSq, fill = reorder(Component, -SumSq))) +
  geom_bar(stat = "identity", width = 1) +
  geom_label(aes(label = Labels), fill = alpha("white", 0),
             position = position_stack(vjust = 0.5),
             show.legend = FALSE) +
  coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
  scale_fill_manual(values = Colours)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    axis.title = element_blank(),  # remove axis titles ("plot.x" is not a valid theme element)
    legend.position = 'right',
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 13),
    axis.text.x=element_blank(), #remove x axis labels
    axis.ticks.x=element_blank(), #remove x axis ticks
    axis.text.y=element_blank(),  #remove y axis labels
    axis.ticks.y=element_blank()  #remove y axis ticks
  ) +
  labs(title = "Sum of Squares from Regression Model Summary",
       subtitle = "Based on Wine Data from Portugal",
       caption = "Data source: Public Domain",
       fill = "Components",
       x = "", 
       y = "")

4.8 Show Pareto Chart of Sum of Squares for Regression Model

# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)

# Prepare Table
Pareto <- (SSdf$SumSq)
names(Pareto) <- SSdf$Component

# Coordinates for the overlay on the cumulative line
line_x <- seq_along(SSdf$Component) + 0.5  # Adjust x-position if needed
line_y <- CumSum                           # cumulative Sum of Squares (matches the left-hand axis)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )

# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
                     ylab = "Sum of Squares", # label y left
 
  # colors of the chart            
  col = Colours,
   
  # ranges of the percentages at the right
  cumperc = seq(0, 100, by = 20),
   
  # label y right
  ylab2 = "Cumulative Percentage",
   
  # title of the chart
  main     = "Sum of Squares from Regression Model Summary",
  subtitle = "Based on Wine Data from Portugal",
  
  # font size
  cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0
)
##            
## Pareto chart analysis for Pareto
##             Frequency Cum.Freq. Percentage Cum.Percent.
##   Residuals 3546.1198 3546.1198    71.5855      71.5855
##   Alcohol    484.1102 4030.2301     9.7727      81.3582
##   VolAcid    322.3276 4352.5577     6.5068      87.8650
##   TotalSulf  213.0370 4565.5947     4.3006      92.1656
##   Density    187.3637 4752.9583     3.7823      95.9479
##   Sulphates  103.2340 4856.1923     2.0840      98.0319
##   Chlorid     56.7437 4912.9360     1.1455      99.1774
##   FixAcid     29.1748 4942.1109     0.5890      99.7663
##   FreeSulf    11.5748 4953.6857     0.2337     100.0000
# Overlay points on the cumulative line drawn by pareto.chart
points(line_x, line_y, pch = 20, col = "red")

# Add cumulative-percentage labels next to the points
text(line_x, line_y, labels = round(CumSum / sum(SSdf$SumSq) * 100, 1), pos = 3, col = "red")
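
The table printed above is also the return value of pareto.chart(), so it can be captured for further use without re-drawing the chart; a sketch, assuming the installed qcc version exposes the plot argument (current releases do):

# Capture the Pareto table without re-drawing the chart (sketch)
ParetoTab <- pareto.chart(Pareto, plot = FALSE)
ParetoTab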

4.9 Show Pareto Chart of Sum of Squares for Regression Model without Residuals

# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)

# Prepare Table
Pareto <- SSdf[-c(1),]$SumSq
names(Pareto) <- SSdf[-c(1),]$Component

# Coordinates for an optional overlay on the cumulative line (no overlay is drawn below)
line_x <- seq_along(Pareto) + 0.5  # Adjust x-position if needed
line_y <- cumsum(Pareto)

# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )

# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
                     ylab = "Sum of Squares", # label y left
 
  # colors of the chart            
  col = Colours,
   
  # ranges of the percentages at the right
  cumperc = seq(0, 100, by = 20),
   
  # label y right
  ylab2 = "Cumulative Percentage",
   
  # title of the chart
  main     = "Sum of Squares from Regression Model Summary",
  subtitle = "Based on Wine Data from Portugal",
  
  # font size
  cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0
)

##            
## Pareto chart analysis for Pareto
##             Frequency Cum.Freq. Percentage Cum.Percent.
##   Alcohol    484.1102  484.1102    34.3934      34.3934
##   VolAcid    322.3276  806.4379    22.8996      57.2931
##   TotalSulf  213.0370 1019.4748    15.1351      72.4282
##   Density    187.3637 1206.8385    13.3112      85.7394
##   Sulphates  103.2340 1310.0725     7.3342      93.0736
##   Chlorid     56.7437 1366.8162     4.0313      97.1050
##   FixAcid     29.1748 1395.9910     2.0727      99.1777
##   FreeSulf    11.5748 1407.5659     0.8223     100.0000

4.10 Show Pareto Chart of Sum of Squares for Regression Model without Residuals using ggplot

# Drop the Residuals row, then calculate the percentage and cumulative percentage for each Component
SSdf <- SSdf[SSdf$Component != "Residuals", ]
SSdf$SumSq    <- round(SSdf$SumSq, digits = 0)
SSdf$Percent  <- round(100 * SSdf$SumSq / sum(SSdf$SumSq), 2)
SSdf$CumPerc  <- cumsum(SSdf$Percent)

# Scale the cumulative percentages to the range of the Sum of Squares (primary) axis
SSdf$CumPercScaled <- SSdf$CumPerc * sum(SSdf$SumSq) / 100

# Define y-axis breaks for alignment
primary_breaks   <- seq(0, max(SSdf$CumPercScaled) + 200, by = 500)  # Grid for primary axis
secondary_breaks <- seq(0, 105, by = 10)                       # Grid for secondary axis

# Plot the Pareto chart
Plot <- ggplot(SSdf, aes(x = reorder(Component, -SumSq), y = SumSq)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # Bars for frequency
  geom_text(aes(label = SumSq), vjust = -2.0, size = 4) +  # Labels for bars
  geom_line(aes(y = CumPercScaled, group = 1),  # Line for cumulative %
            color = "red", linewidth = 1) +
  geom_point(aes(y = CumPercScaled), color = "red") +  # Points on line
  
  # Labels for cumulative percentage
  geom_text(aes(
    y = CumPercScaled, 
    label = sprintf("%.1f%%", CumPerc)
  ), color = "red", vjust = -0.5, size = 4) +
  
  # Secondary axis: rescale the Sum of Squares axis back to percentage
  scale_y_continuous(
    breaks = primary_breaks,  # Primary y-axis grid
    sec.axis = sec_axis(
      ~ . * 100 / sum(SSdf$SumSq),  # Correct transformation for secondary axis
      name = "Cumulative Percentage", 
      breaks = secondary_breaks  # Secondary y-axis grid
    )
  ) +  
  
  labs(title = "Pareto Chart of Sum of Squares - Importance of Factors",
       x = NULL,
       y = "Frequency") +
  theme_minimal() +  # Ensure gridlines are clearly visible
  theme(
    axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    panel.grid.major.y = element_line(color = "gray", linewidth = 0.5),  # Adjust grid style
    panel.grid.minor.y = element_blank()  # Remove minor grid lines
  )


# Print the plot
print(Plot)
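
If the chart needs to be saved to disk (for example for a report), ggsave() can write the plot object; the file name below is only an illustration:

# Save the Pareto chart (hypothetical file name, written to the working directory)
ggsave("ParetoSumSquares.png", plot = Plot, width = 10, height = 6, dpi = 300)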

4.11 Prepare Predictions for Linear Regression and Classification and Regression Tree

# Save Predictions in the Wines data frame
Wines$PredCart  <- predict(WineTree, Wines)
Wines$PredCartW <- predict(WineTreeW, Wines)
Wines$PredLin   <- predict(LinReg12, Wines)


RMSEcart <- sqrt(mean((as.numeric(as.character(Wines$PredCart))  - as.numeric(as.character(Wines$Quality))) ^2))
RMSEcartW <- sqrt(mean((as.numeric(as.character(Wines$PredCartW)) - as.numeric(as.character(Wines$Quality))) ^2))
RMSElin <- sqrt(mean((as.numeric(as.character(Wines$PredLin))   - as.numeric(as.character(Wines$Quality))) ^2))

cat("\n\nRMSE of Wine Model with CART: ", RMSEcart)
## 
## 
## RMSE of Wine Model with CART:  0.5859
cat("\n\nRMSE of Wine Model with CART after Winnowing/Boosting: ", RMSEcartW)
## 
## 
## RMSE of Wine Model with CART after Winnowing/Boosting:  0.3794
cat("\n\nRMSE of Wine Model with Linear Regression: ", RMSElin)
## 
## 
## RMSE of Wine Model with Linear Regression:  2.132
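
The linear-regression RMSE above is noticeably larger than the model's residual standard error (0.739). A likely explanation, stated here as an assumption, is that QualityNum was created with as.numeric(Quality); if Quality is a factor, this returns the level index rather than the original score, so the predictions and the re-converted scores sit on shifted scales. A small sketch that makes the comparison scale explicit:

# Helper to avoid repeating the RMSE formula (sketch)
rmse <- function(pred, obs) sqrt(mean((pred - obs)^2))

# RMSE on the scale the linear model was actually fitted on (factor level indices)
rmse(Wines$PredLin, Wines$QualityNum)

# RMSE against the original quality scores (as used above)
rmse(Wines$PredLin, as.numeric(as.character(Wines$Quality)))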