Install and load the required packages for model evaluation and visualisation. This ensures all necessary libraries are available before the analysis is run. The packages include tools for data processing, visualisation, and regression diagnostics.
# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
# Loading other packages if not available
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "pROC" %in% installed.packages()) { install.packages("pROC", dependencies = TRUE) }
library(pROC)
if(! "caret" %in% installed.packages()) { install.packages("caret", dependencies = TRUE) }
library(caret)
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
# Global Settings
options(digits = 4)
options(scipen = 999)
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
Load the dataset (downloaded from a URL or read from an Excel file). Ensure it is in the correct format and contains the necessary variables: the regression analysis requires both the predictor and the response variables.
# Download Training Data from URL
# Download Test Data from URL
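# A minimal sketch of the elided load step (assumption: the file has been saved
# locally as "Cereals.csv"; the actual download URL is not shown in this report)
Cereals <- read.csv("Cereals.csv", stringsAsFactors = TRUE)
# If reading the Excel source instead: Cereals <- data.frame(read_excel("Cereals.xlsx"))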
Characteristics of the dataset are shown together with descriptive statistics.
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
Cereals$ID <- 1:nrow(Cereals)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Cereals)
## [1] "name" "mfr" "type" "calories" "protein" "fat"
## [7] "sodium" "fiber" "carbo" "sugars" "potass" "vitamins"
## [13] "shelf" "weight" "cups" "rating" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Cereals)
## 'data.frame': 77 obs. of 17 variables:
## $ name : Factor w/ 77 levels "100% Bran","100% Natural Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ mfr : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
## $ type : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 -1 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Cereals, 5)
## name mfr type calories protein fat sodium fiber carbo
## 1 100% Bran N C 70 4 1 130 10 5
## 2 100% Natural Bran Q C 120 3 5 15 2 8
## 3 All-Bran K C 70 4 1 260 9 7
## 4 All-Bran with Extra Fiber K C 50 4 0 140 14 8
## 5 Almond Delight R C 110 2 2 200 1 14
## sugars potass vitamins shelf weight cups rating ID
## 1 6 280 25 3 1 0.33 68.40 1
## 2 8 135 0 3 1 1.00 33.98 2
## 3 5 320 25 3 1 0.33 59.43 3
## 4 0 330 25 3 1 0.50 93.70 4
## 5 8 -1 25 3 1 0.75 34.38 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Cereals[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max
## 1 mfr 77
## 2 ... A 1 1.299%
## 3 ... G 22 28.571%
## 4 ... K 23 29.87%
## 5 ... N 6 7.792%
## 6 ... P 9 11.688%
## 7 ... Q 8 10.39%
## 8 ... R 8 10.39%
## 9 type 77
## 10 ... C 74 96.104%
## 11 ... H 3 3.896%
## 12 calories 77 106.88 19.484 50 100 110 110 160
## 13 protein 77 2.5455 1.0948 1 2 3 3 6
## 14 fat 77 1.013 1.0065 0 0 1 2 5
## 15 sodium 77 159.68 83.832 0 130 180 210 320
## 16 fiber 77 2.1519 2.3834 0 1 2 3 14
## 17 carbo 77 14.597 4.279 -1 12 14 17 23
## 18 sugars 76 7.0263 4.3787 0 3 7 11 15
## 19 potass 77 96.078 71.287 -1 40 90 120 330
## 20 vitamins 77 28.247 22.343 0 25 25 25 100
## 21 shelf 77 2.2078 0.83252 1 1 2 3 3
## 22 weight 77 1.0296 0.15048 0.5 1 1 1 1.5
## 23 cups 77 0.82104 0.23272 0.25 0.67 0.75 1 1.5
## 24 rating 77 42.666 14.047 18.043 33.174 40.4 50.828 93.705
## 25 ID 77 39 22.372 1 20 39 58 77
## NA Mode
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## 18 0.012987012987013
## 19 0
## 20 0
## 21 0
## 22 0
## 23 0
## 24 0
## 25 0
# Show number of Blanks
cat("\n\nNumber of Blanks per Column:\n")
##
##
## Number of Blanks per Column:
sapply(Cereals, function(x) sum(x == "" | x == "NA"))
## name mfr type calories protein fat sodium fiber
## 0 0 0 0 0 0 0 0
## carbo sugars potass vitamins shelf weight cups rating
## 0 NA 0 0 0 0 0 0
## ID
## 0
Check for missing values and handle them appropriately. Missing values can impact model performance, so we need to decide whether to impute or remove them. Ensure that categorical variables are converted to factors for proper modeling.
# Define the function to replace NAs with the mean
NA2mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
# Apply the function across all columns, grouped by CategoryColumn
Cereals <- Cereals %>%
# group_by(Country) %>%
mutate(across(everything(), NA2mean))
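# Quick sanity check of the helper (a sketch): the single NA is replaced by the
# mean of the remaining values, so NA2mean(c(1, NA, 3)) returns 1 2 3
NA2mean(c(1, NA, 3))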
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Cereals[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 mfr 77
## 2 ... A 1 1.299%
## 3 ... G 22 28.571%
## 4 ... K 23 29.87%
## 5 ... N 6 7.792%
## 6 ... P 9 11.688%
## 7 ... Q 8 10.39%
## 8 ... R 8 10.39%
## 9 type 77
## 10 ... C 74 96.104%
## 11 ... H 3 3.896%
## 12 calories 77 106.88 19.484 50 100 110 110 160 0
## 13 protein 77 2.5455 1.0948 1 2 3 3 6 0
## 14 fat 77 1.013 1.0065 0 0 1 2 5 0
## 15 sodium 77 159.68 83.832 0 130 180 210 320 0
## 16 fiber 77 2.1519 2.3834 0 1 2 3 14 0
## 17 carbo 77 14.597 4.279 -1 12 14 17 23 0
## 18 sugars 77 7.0263 4.3498 0 3 7 11 15 0
## 19 potass 77 96.078 71.287 -1 40 90 120 330 0
## 20 vitamins 77 28.247 22.343 0 25 25 25 100 0
## 21 shelf 77 2.2078 0.83252 1 1 2 3 3 0
## 22 weight 77 1.0296 0.15048 0.5 1 1 1 1.5 0
## 23 cups 77 0.82104 0.23272 0.25 0.67 0.75 1 1.5 0
## 24 rating 77 42.666 14.047 18.043 33.174 40.4 50.828 93.705 0
## 25 ID 77 39 22.372 1 20 39 58 77 0
# Write Data to WD
write.csv(Cereals, file = "CerealsClean.csv")
# Loading other packages if not available
if(! "mltools" %in% installed.packages()) { install.packages("mltools", dependencies = TRUE, INSTALL_opts = '--no-lock') }
library(mltools)
if(! "data.table" %in% installed.packages()) { install.packages("data.table", dependencies = TRUE, INSTALL_opts = '--no-lock') }
library(data.table)
# One-hot encode the categorical variables (mfr and type) and drop the name column
Cereals <- one_hot(as.data.table(Cereals[, -1]))
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Cereals, 5)
## mfr_A mfr_G mfr_K mfr_N mfr_P mfr_Q mfr_R type_C type_H calories protein
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <num> <num>
## 1: 0 0 0 1 0 0 0 1 0 70 4
## 2: 0 0 0 0 0 1 0 1 0 120 3
## 3: 0 0 1 0 0 0 0 1 0 70 4
## 4: 0 0 1 0 0 0 0 1 0 50 4
## 5: 0 0 0 0 0 0 1 1 0 110 2
## fat sodium fiber carbo sugars potass vitamins shelf weight cups rating
## <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
## 1: 1 130 10 5 6 280 25 3 1 0.33 68.40
## 2: 5 15 2 8 8 135 0 3 1 1.00 33.98
## 3: 1 260 9 7 5 320 25 3 1 0.33 59.43
## 4: 0 140 14 8 0 330 25 3 1 0.50 93.70
## 5: 2 200 1 14 8 -1 25 3 1 0.75 34.38
## ID
## <num>
## 1: 1
## 2: 2
## 3: 3
## 4: 4
## 5: 5
cat("\n\nList of Columns (Variables):\n")
##
##
## List of Columns (Variables):
names(Cereals)
## [1] "mfr_A" "mfr_G" "mfr_K" "mfr_N" "mfr_P" "mfr_Q"
## [7] "mfr_R" "type_C" "type_H" "calories" "protein" "fat"
## [13] "sodium" "fiber" "carbo" "sugars" "potass" "vitamins"
## [19] "shelf" "weight" "cups" "rating" "ID"
# Write Data to WD
write.csv(Cereals, file = "CerealsClean.csv")
# Loading other packages if not available
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
# Convert the data.table back to a plain data frame
# Shorten and title-case the column names
Cereals <- data.frame(Cereals)
names(Cereals) <- str_to_title(names(Cereals))
names(Cereals) <- substr(names(Cereals), 1, 9)
# Split data into train and test
set.seed(123)
split <- initial_split(Cereals, prop = 0.80)
CerealsTrain <- training(split)
CerealsTest <- testing(split)
# Select numeric columns for correlation matrix
CerealNum <- data.frame(select_if(CerealsTrain[, -23], is.numeric))
# Show Split Data
cat("\nShow Structure of the Training Data Frame:\n")
##
## Show Structure of the Training Data Frame:
cat("Training Data Frame includes ", nrow(CerealsTrain), " rows and ", ncol(CerealsTrain), " columns.")
## Training Data Frame includes 61 rows and 23 columns.
cat("\n\nShow Structure of the Testing Data Frame:\n")
##
##
## Show Structure of the Testing Data Frame:
cat("Testing Data Frame includes ", nrow(CerealsTest), " rows and ", ncol(CerealsTest), " columns.")
## Testing Data Frame includes 16 rows and 23 columns.
cat("\n\nShow Columns of the Training Data Frame:\n")
##
##
## Show Columns of the Training Data Frame:
names(CerealsTrain)
## [1] "Mfr_a" "Mfr_g" "Mfr_k" "Mfr_n" "Mfr_p" "Mfr_q"
## [7] "Mfr_r" "Type_c" "Type_h" "Calories" "Protein" "Fat"
## [13] "Sodium" "Fiber" "Carbo" "Sugars" "Potass" "Vitamins"
## [19] "Shelf" "Weight" "Cups" "Rating" "Id"
# Plot Correlation Matrix of Numeric Values
corrMat <- cor(CerealNum)
# Display results
round(corrMat, 2)
## Mfr_a Mfr_g Mfr_k Mfr_n Mfr_p Mfr_q Mfr_r Type_c Type_h Calories
## Mfr_a 1.00 -0.09 -0.08 -0.04 -0.05 -0.04 -0.04 -0.70 0.70 -0.05
## Mfr_g -0.09 1.00 -0.40 -0.20 -0.26 -0.22 -0.22 0.12 -0.12 0.23
## Mfr_k -0.08 -0.40 1.00 -0.18 -0.23 -0.20 -0.20 0.11 -0.11 0.11
## Mfr_n -0.04 -0.20 -0.18 1.00 -0.12 -0.10 -0.10 -0.28 0.28 -0.37
## Mfr_p -0.05 -0.26 -0.23 -0.12 1.00 -0.13 -0.13 0.07 -0.07 0.07
## Mfr_q -0.04 -0.22 -0.20 -0.10 -0.13 1.00 -0.11 0.06 -0.06 -0.33
## Mfr_r -0.04 -0.22 -0.20 -0.10 -0.13 -0.11 1.00 0.06 -0.06 0.09
## Type_c -0.70 0.12 0.11 -0.28 0.07 0.06 0.06 1.00 -1.00 0.08
## Type_h 0.70 -0.12 -0.11 0.28 -0.07 -0.06 -0.06 -1.00 1.00 -0.08
## Calories -0.05 0.23 0.11 -0.37 0.07 -0.33 0.09 0.08 -0.08 1.00
## Protein 0.19 -0.09 0.05 0.11 -0.01 -0.07 -0.02 -0.18 0.18 0.07
## Fat 0.01 0.31 -0.26 -0.25 -0.03 0.08 0.08 0.09 -0.09 0.51
## Sodium -0.26 0.31 0.10 -0.44 -0.09 -0.18 0.17 0.28 -0.28 0.42
## Fiber -0.12 -0.23 -0.03 0.38 0.20 -0.14 0.01 0.12 -0.12 -0.10
## Carbo 0.04 0.00 0.09 0.03 -0.19 -0.23 0.27 -0.18 0.18 0.16
## Sugars -0.12 0.16 0.09 -0.33 0.18 -0.10 -0.12 0.23 -0.23 0.52
## Potass 0.02 -0.01 -0.15 0.18 0.21 -0.14 -0.03 0.11 -0.11 0.11
## Vitamins -0.02 0.17 0.19 -0.26 -0.05 -0.18 -0.05 0.14 -0.14 0.31
## Shelf -0.02 -0.01 0.01 -0.12 0.12 0.16 -0.18 0.03 -0.03 0.11
## Weight -0.02 0.17 0.09 -0.11 0.14 -0.42 -0.04 0.02 -0.02 0.76
## Cups 0.09 0.15 0.05 -0.06 -0.25 -0.03 0.04 -0.12 0.12 -0.06
## Rating 0.14 -0.40 -0.02 0.59 -0.02 0.05 0.03 -0.26 0.26 -0.64
## Protein Fat Sodium Fiber Carbo Sugars Potass Vitamins Shelf Weight
## Mfr_a 0.19 0.01 -0.26 -0.12 0.04 -0.12 0.02 -0.02 -0.02 -0.02
## Mfr_g -0.09 0.31 0.31 -0.23 0.00 0.16 -0.01 0.17 -0.01 0.17
## Mfr_k 0.05 -0.26 0.10 -0.03 0.09 0.09 -0.15 0.19 0.01 0.09
## Mfr_n 0.11 -0.25 -0.44 0.38 0.03 -0.33 0.18 -0.26 -0.12 -0.11
## Mfr_p -0.01 -0.03 -0.09 0.20 -0.19 0.18 0.21 -0.05 0.12 0.14
## Mfr_q -0.07 0.08 -0.18 -0.14 -0.23 -0.10 -0.14 -0.18 0.16 -0.42
## Mfr_r -0.02 0.08 0.17 0.01 0.27 -0.12 -0.03 -0.05 -0.18 -0.04
## Type_c -0.18 0.09 0.28 0.12 -0.18 0.23 0.11 0.14 0.03 0.02
## Type_h 0.18 -0.09 -0.28 -0.12 0.18 -0.23 -0.11 -0.14 -0.03 -0.02
## Calories 0.07 0.51 0.42 -0.10 0.16 0.52 0.11 0.31 0.11 0.76
## Protein 1.00 0.15 0.01 0.48 0.07 -0.32 0.52 0.04 0.13 0.24
## Fat 0.15 1.00 0.12 0.09 -0.26 0.25 0.23 0.05 0.30 0.26
## Sodium 0.01 0.12 1.00 -0.13 0.38 0.03 -0.05 0.34 -0.14 0.37
## Fiber 0.48 0.09 -0.13 1.00 -0.29 -0.05 0.89 -0.04 0.30 0.34
## Carbo 0.07 -0.26 0.38 -0.29 1.00 -0.60 -0.27 0.20 -0.18 0.11
## Sugars -0.32 0.25 0.03 -0.05 -0.60 1.00 0.08 0.13 0.09 0.43
## Potass 0.52 0.23 -0.05 0.89 -0.27 0.08 1.00 0.07 0.36 0.50
## Vitamins 0.04 0.05 0.34 -0.04 0.20 0.13 0.07 1.00 0.28 0.35
## Shelf 0.13 0.30 -0.14 0.30 -0.18 0.09 0.36 0.28 1.00 0.19
## Weight 0.24 0.26 0.37 0.34 0.11 0.43 0.50 0.35 0.19 1.00
## Cups -0.21 -0.33 0.11 -0.50 0.36 -0.10 -0.45 0.12 -0.39 -0.21
## Rating 0.49 -0.40 -0.45 0.48 0.22 -0.76 0.28 -0.28 0.01 -0.32
## Cups Rating
## Mfr_a 0.09 0.14
## Mfr_g 0.15 -0.40
## Mfr_k 0.05 -0.02
## Mfr_n -0.06 0.59
## Mfr_p -0.25 -0.02
## Mfr_q -0.03 0.05
## Mfr_r 0.04 0.03
## Type_c -0.12 -0.26
## Type_h 0.12 0.26
## Calories -0.06 -0.64
## Protein -0.21 0.49
## Fat -0.33 -0.40
## Sodium 0.11 -0.45
## Fiber -0.50 0.48
## Carbo 0.36 0.22
## Sugars -0.10 -0.76
## Potass -0.45 0.28
## Vitamins 0.12 -0.28
## Shelf -0.39 0.01
## Weight -0.21 -0.32
## Cups 1.00 -0.09
## Rating -0.09 1.00
# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)
cat("\n\nShow Correlation Plot of Cereal Data:\n")
##
##
## Show Correlation Plot of Cereal Data:
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
Create a multiple regression model using the lm() function. The response variable should be continuous, and the predictors should be selected based on domain knowledge. Check the regression assumptions, including linearity and multicollinearity.
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Run Linear Regression with all Xs
LinReg01 <- lm(Rating ~ Mfr_g + Mfr_k + Mfr_n + Mfr_p + Mfr_q + Mfr_r + Calories + Protein + Fat + Sodium + Fiber + Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
summary(LinReg01)
##
## Call:
## lm(formula = Rating ~ Mfr_g + Mfr_k + Mfr_n + Mfr_p + Mfr_q +
## Mfr_r + Calories + Protein + Fat + Sodium + Fiber + Carbo +
## Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0000005432 -0.0000002190 -0.0000000067 0.0000001784 0.0000005201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.927184076586 0.000000574968 95530922.79 <0.0000000000000002
## Mfr_g 0.000000070142 0.000000393572 0.18 0.86
## Mfr_k -0.000000019625 0.000000436983 -0.04 0.96
## Mfr_n 0.000000000697 0.000000462933 0.00 1.00
## Mfr_p 0.000000258209 0.000000424684 0.61 0.55
## Mfr_q 0.000000038642 0.000000405081 0.10 0.92
## Mfr_r -0.000000106761 0.000000468450 -0.23 0.82
## Calories -0.222724168805 0.000000008860 -25138927.43 <0.0000000000000002
## Protein 3.273173867642 0.000000060806 53829736.69 <0.0000000000000002
## Fat -1.691407935680 0.000000097071 -17424498.05 <0.0000000000000002
## Sodium -0.054492702474 0.000000000760 -71658380.96 <0.0000000000000002
## Fiber 3.443479867529 0.000000095657 35998084.37 <0.0000000000000002
## Carbo 1.092450961999 0.000000047682 22911291.53 <0.0000000000000002
## Sugars -0.724895126352 0.000000043386 -16708145.91 <0.0000000000000002
## Potass -0.033993352893 0.000000002332 -14578389.20 <0.0000000000000002
## Vitamins -0.051211966321 0.000000002392 -21407941.11 <0.0000000000000002
## Shelf -0.000000102398 0.000000065294 -1.57 0.12
## Weight -0.000000102594 0.000000846602 -0.12 0.90
## Cups 0.000000246336 0.000000238248 1.03 0.31
##
## (Intercept) ***
## Mfr_g
## Mfr_k
## Mfr_n
## Mfr_p
## Mfr_q
## Mfr_r
## Calories ***
## Protein ***
## Fat ***
## Sodium ***
## Fiber ***
## Carbo ***
## Sugars ***
## Potass ***
## Vitamins ***
## Shelf
## Weight
## Cups
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000000304 on 42 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.22e+15 on 18 and 42 DF, p-value: <0.0000000000000002
anova(LinReg01)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Mfr_g 1 1644 1644 17826911792018684.00 <0.0000000000000002 ***
## Mfr_k 1 408 408 4424468649701570.00 <0.0000000000000002 ***
## Mfr_n 1 2443 2443 26496108344241260.00 <0.0000000000000002 ***
## Mfr_p 1 51 51 558420728560620.50 <0.0000000000000002 ***
## Mfr_q 1 3 3 36273542372380.66 <0.0000000000000002 ***
## Mfr_r 1 135 135 1461432841867144.75 <0.0000000000000002 ***
## Calories 1 1806 1806 19590988157889628.00 <0.0000000000000002 ***
## Protein 1 2188 2188 23728118829838116.00 <0.0000000000000002 ***
## Fat 1 58 58 632887772742377.62 <0.0000000000000002 ***
## Sodium 1 100 100 1085712742772386.25 <0.0000000000000002 ***
## Fiber 1 43 43 468880954465192.94 <0.0000000000000002 ***
## Carbo 1 1291 1291 14002347678624214.00 <0.0000000000000002 ***
## Sugars 1 71 71 771674488825328.00 <0.0000000000000002 ***
## Potass 1 26 26 276754550243907.62 <0.0000000000000002 ***
## Vitamins 1 49 49 534286553581679.00 <0.0000000000000002 ***
## Shelf 1 0 0 3.64 0.063 .
## Weight 1 0 0 0.16 0.691
## Cups 1 0 0 1.07 0.307
## Residuals 42 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Perform diagnostic checks to evaluate the model assumptions. Check the residuals for normality and homoscedasticity, and the predictors for multicollinearity. Use visualisations and statistical tests to validate these assumptions.
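As a minimal sketch of such formal tests (using LinReg01 from above for illustration), the Shapiro-Wilk test checks normality of the residuals and car's ncvTest() checks for non-constant error variance; these complement the VIF check below.
# Formal assumption checks (sketch)
shapiro.test(residuals(LinReg01)) # normality of residuals
ncvTest(LinReg01) # score test for non-constant variance (car package)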
# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Calculating VIF
vif_values <- vif(LinReg01)
cat("\n\nShow List of VIF for all Factors:\n")
##
##
## Show List of VIF for all Factors:
vif_values
## Mfr_g Mfr_k Mfr_n Mfr_p Mfr_q Mfr_r Calories Protein
## 21.976 24.443 10.668 13.595 9.627 12.875 14.734 2.997
## Fat Sodium Fiber Carbo Sugars Potass Vitamins Shelf
## 4.992 2.588 21.808 21.219 24.680 13.623 1.559 1.858
## Weight Cups
## 10.243 2.039
names(CerealNum)
## [1] "Mfr_a" "Mfr_g" "Mfr_k" "Mfr_n" "Mfr_p" "Mfr_q"
## [7] "Mfr_r" "Type_c" "Type_h" "Calories" "Protein" "Fat"
## [13] "Sodium" "Fiber" "Carbo" "Sugars" "Potass" "Vitamins"
## [19] "Shelf" "Weight" "Cups" "Rating"
# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)
# Drop Columns from CerealNum
CerealNum <- CerealNum[, -c(1:9)]
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
LinReg02 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Fiber + Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
summary(LinReg02)
##
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Fiber +
## Carbo + Sugars + Potass + Vitamins + Shelf + Weight + Cups,
## data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0000005394 -0.0000001912 0.0000000198 0.0000001762 0.0000005301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.927184197908 0.000000423301 129759058.33 <0.0000000000000002
## Calories -0.222724166150 0.000000008204 -27147569.85 <0.0000000000000002
## Protein 3.273173839209 0.000000057083 57340144.61 <0.0000000000000002
## Fat -1.691407979275 0.000000083479 -20261427.86 <0.0000000000000002
## Sodium -0.054492702426 0.000000000584 -93246016.59 <0.0000000000000002
## Fiber 3.443479823430 0.000000057110 60296059.78 <0.0000000000000002
## Carbo 1.092450932970 0.000000040634 26885005.95 <0.0000000000000002
## Sugars -0.724895147880 0.000000037884 -19134619.22 <0.0000000000000002
## Potass -0.033993351852 0.000000001729 -19655615.39 <0.0000000000000002
## Vitamins -0.051211966647 0.000000002320 -22074483.90 <0.0000000000000002
## Shelf -0.000000088573 0.000000059633 -1.49 0.14
## Weight 0.000000206905 0.000000686180 0.30 0.76
## Cups 0.000000224078 0.000000228385 0.98 0.33
##
## (Intercept) ***
## Calories ***
## Protein ***
## Fat ***
## Sodium ***
## Fiber ***
## Carbo ***
## Sugars ***
## Potass ***
## Vitamins ***
## Shelf
## Weight
## Cups
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0000003 on 48 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 9.56e+15 on 12 and 48 DF, p-value: <0.0000000000000002
anova(LinReg02)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories 1 4235 4235 47084416123530368.00 <0.0000000000000002 ***
## Protein 1 2998 2998 33326560615337632.00 <0.0000000000000002 ***
## Fat 1 271 271 3008445072310896.50 <0.0000000000000002 ***
## Sodium 1 455 455 5053205036681088.00 <0.0000000000000002 ***
## Fiber 1 325 325 3611960742822140.50 <0.0000000000000002 ***
## Carbo 1 1845 1845 20508484998727288.00 <0.0000000000000002 ***
## Sugars 1 89 89 984525405604054.75 <0.0000000000000002 ***
## Potass 1 50 50 550839227939786.75 <0.0000000000000002 ***
## Vitamins 1 53 53 584443708100659.00 <0.0000000000000002 ***
## Shelf 1 0 0 3.37 0.073 .
## Weight 1 0 0 0.01 0.904
## Cups 1 0 0 0.96 0.331
## Residuals 48 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Calculating VIF
vif_values <- vif(LinReg02)
cat("\n\nShow List of VIF for Factors:\n")
##
##
## Show List of VIF for Factors:
vif_values
## Calories Protein Fat Sodium Fiber Carbo Sugars Potass
## 12.952 2.708 3.785 1.567 7.969 15.798 19.291 7.683
## Vitamins Shelf Weight Cups
## 1.503 1.589 6.898 1.921
# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)
# Drop Columns from CerealNum
CerealNum <- CerealNum[, -c(6)]
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
LinReg03 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Fiber + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
summary(LinReg03)
##
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Fiber +
## Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0964 -0.6892 -0.0974 0.5579 2.9248
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.49816 1.62373 34.18 < 0.0000000000000002 ***
## Calories -0.06443 0.02194 -2.94 0.0050 **
## Protein 2.42271 0.18250 13.28 < 0.0000000000000002 ***
## Fat -3.18399 0.23944 -13.30 < 0.0000000000000002 ***
## Sodium -0.05253 0.00223 -23.59 < 0.0000000000000002 ***
## Fiber 3.04367 0.21177 14.37 < 0.0000000000000002 ***
## Sugars -1.68882 0.04699 -35.94 < 0.0000000000000002 ***
## Potass -0.03305 0.00664 -4.98 0.0000084 ***
## Vitamins -0.04736 0.00889 -5.33 0.0000025 ***
## Shelf 0.04574 0.22894 0.20 0.8425
## Weight 7.42864 2.41231 3.08 0.0034 **
## Cups 1.91401 0.83345 2.30 0.0260 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.15 on 49 degrees of freedom
## Multiple R-squared: 0.994, Adjusted R-squared: 0.992
## F-statistic: 703 on 11 and 49 DF, p-value: <0.0000000000000002
anova(LinReg03)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories 1 4235 4235 3191.93 < 0.0000000000000002 ***
## Protein 1 2998 2998 2259.26 < 0.0000000000000002 ***
## Fat 1 271 271 203.95 < 0.0000000000000002 ***
## Sodium 1 455 455 342.56 < 0.0000000000000002 ***
## Fiber 1 325 325 244.86 < 0.0000000000000002 ***
## Sugars 1 1884 1884 1419.89 < 0.0000000000000002 ***
## Potass 1 33 33 24.74 0.0000085 ***
## Vitamins 1 35 35 26.33 0.0000049 ***
## Shelf 1 0 0 0.21 0.6479
## Weight 1 11 11 8.57 0.0052 **
## Cups 1 7 7 5.27 0.0260 *
## Residuals 49 65 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Calculating VIF
vif_values <- vif(LinReg03)
cat("\n\nShow List of VIF for Factors:\n")
##
##
## Show List of VIF for Factors:
vif_values
## Calories Protein Fat Sodium Fiber Sugars Potass Vitamins
## 6.282 1.877 2.111 1.542 7.428 2.012 7.679 1.498
## Shelf Weight Cups
## 1.588 5.780 1.734
# Load Library if not available
if(! "corrplot" %in% installed.packages()) { install.packages("corrplot", dependencies = TRUE) }
library(corrplot)
# Drop Columns from CerealNum
CerealNum <- CerealNum[, -c(5)]
# Plot Correlation Matrix of Numeric Values
corrplot.mixed(cor(CerealNum),
upper = "pie",
lower = "number",
addgrid.col = "black",
tl.col = "black")
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
LinReg04 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
summary(LinReg04)
##
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars +
## Potass + Vitamins + Shelf + Weight + Cups, data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.477 -1.436 -0.029 1.526 5.305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.27041 3.51296 17.73 < 0.0000000000000002 ***
## Calories -0.15150 0.04768 -3.18 0.0025 **
## Protein 2.50371 0.41240 6.07 0.00000016968177 ***
## Fat -3.39904 0.54027 -6.29 0.00000007698998 ***
## Sodium -0.05350 0.00503 -10.63 0.00000000000002 ***
## Sugars -1.74649 0.10585 -16.50 < 0.0000000000000002 ***
## Potass 0.04137 0.00940 4.40 0.00005670600108 ***
## Vitamins -0.05618 0.02006 -2.80 0.0072 **
## Shelf -0.17109 0.51646 -0.33 0.7418
## Weight 13.52251 5.36884 2.52 0.0150 *
## Cups -1.87535 1.78750 -1.05 0.2992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.6 on 50 degrees of freedom
## Multiple R-squared: 0.967, Adjusted R-squared: 0.961
## F-statistic: 147 on 10 and 50 DF, p-value: <0.0000000000000002
anova(LinReg04)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories 1 4235 4235 624.49 < 0.0000000000000002 ***
## Protein 1 2998 2998 442.02 < 0.0000000000000002 ***
## Fat 1 271 271 39.90 0.000000070271 ***
## Sodium 1 455 455 67.02 0.000000000086 ***
## Sugars 1 1415 1415 208.73 < 0.0000000000000002 ***
## Potass 1 483 483 71.24 0.000000000035 ***
## Vitamins 1 66 66 9.77 0.0030 **
## Shelf 1 0 0 0.00 0.9972
## Weight 1 49 49 7.20 0.0099 **
## Cups 1 7 7 1.10 0.2992
## Residuals 50 339 7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Library if not available
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Calculating VIF
vif_values <- vif(LinReg04)
cat("\n\nShow List of VIF for Factors:\n")
##
##
## Show List of VIF for Factors:
vif_values
## Calories Protein Fat Sodium Sugars Potass Vitamins Shelf
## 5.803 1.875 2.103 1.541 1.998 3.011 1.490 1.581
## Weight Cups
## 5.601 1.561
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Run Regression
LinReg05 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins + Weight, data = CerealsTrain)
summary(LinReg05)
##
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars +
## Potass + Vitamins + Weight, data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.789 -1.587 -0.068 1.385 5.272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.17314 2.71925 22.13 < 0.0000000000000002 ***
## Calories -0.15872 0.04675 -3.40 0.0013 **
## Protein 2.51268 0.40689 6.18 0.0000001016848212 ***
## Fat -3.27124 0.50378 -6.49 0.0000000317633263 ***
## Sodium -0.05339 0.00483 -11.06 0.0000000000000029 ***
## Sugars -1.74005 0.10462 -16.63 < 0.0000000000000002 ***
## Potass 0.04264 0.00901 4.73 0.0000174863141909 ***
## Vitamins -0.06168 0.01799 -3.43 0.0012 **
## Weight 14.27110 5.27415 2.71 0.0092 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.58 on 52 degrees of freedom
## Multiple R-squared: 0.966, Adjusted R-squared: 0.961
## F-statistic: 187 on 8 and 52 DF, p-value: <0.0000000000000002
anova(LinReg05)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories 1 4235 4235 635.47 < 0.0000000000000002 ***
## Protein 1 2998 2998 449.79 < 0.0000000000000002 ***
## Fat 1 271 271 40.60 0.00000004954 ***
## Sodium 1 455 455 68.20 0.00000000005 ***
## Sugars 1 1415 1415 212.40 < 0.0000000000000002 ***
## Potass 1 483 483 72.49 0.00000000002 ***
## Vitamins 1 66 66 9.94 0.0027 **
## Weight 1 49 49 7.32 0.0092 **
## Residuals 52 347 7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculating VIF
vif_values <- vif(LinReg05)
cat("\n\nShow List of VIF for Factors:\n")
##
##
## Show List of VIF for Factors:
vif_values
## Calories Protein Fat Sodium Sugars Potass Vitamins Weight
## 5.675 1.857 1.860 1.443 1.986 2.816 1.220 5.500
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
LinReg06 <- lm(Rating ~ Calories + Protein + Fat + Sodium + Sugars + Potass + Vitamins, data = CerealsTrain)
summary(LinReg06)
##
## Call:
## lm(formula = Rating ~ Calories + Protein + Fat + Sodium + Sugars +
## Potass + Vitamins, data = CerealsTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.040 -2.050 -0.098 1.609 6.485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.13419 2.63362 23.97 < 0.0000000000000002 ***
## Calories -0.06406 0.03280 -1.95 0.0561 .
## Protein 2.50596 0.43047 5.82 0.000000347773263 ***
## Fat -3.90541 0.47179 -8.28 0.000000000040370 ***
## Sodium -0.05111 0.00503 -10.16 0.000000000000048 ***
## Sugars -1.71224 0.11015 -15.54 < 0.0000000000000002 ***
## Potass 0.05885 0.00712 8.26 0.000000000043202 ***
## Vitamins -0.05638 0.01892 -2.98 0.0043 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.73 on 53 degrees of freedom
## Multiple R-squared: 0.962, Adjusted R-squared: 0.957
## F-statistic: 190 on 7 and 53 DF, p-value: <0.0000000000000002
anova(LinReg06)
## Analysis of Variance Table
##
## Response: Rating
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories 1 4235 4235 567.75 < 0.0000000000000002 ***
## Protein 1 2998 2998 401.86 < 0.0000000000000002 ***
## Fat 1 271 271 36.28 0.000000166435 ***
## Sodium 1 455 455 60.93 0.000000000229 ***
## Sugars 1 1415 1415 189.76 < 0.0000000000000002 ***
## Potass 1 483 483 64.76 0.000000000094 ***
## Vitamins 1 66 66 8.88 0.0043 **
## Residuals 53 395 7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculating VIF
vif_values <- vif(LinReg06)
vif_values
## Calories Protein Fat Sodium Sugars Potass Vitamins
## 2.496 1.857 1.458 1.400 1.967 1.572 1.205
# Get the ANOVA table
anova_table <- anova(LinReg06)
# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`
# Prepare data for the pie chart
SSdf <- data.frame(
Component = rownames(anova_table),
SumSq = SumSq
)
# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]
# Combine the two smallest rows (Fat and Vitamins) into one summary row
summary_row <- SSdf %>%
slice_tail(n = 2) %>%
summarise(Component = "Fat+Vitamin", SumSq = sum(SumSq))
# Drop the two smallest rows from the original data frame
Rows <- nrow(SSdf) - 2
SSdf <- SSdf[1:Rows, ]
# Add the summary row to the modified data frame
SSdf <- bind_rows(SSdf, summary_row)
# Add Percentage Column and compute the cumulative sum (used later for the Pareto chart)
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
CumSum <- cumsum(SSdf$SumSq)
# Format the Percentage Column as text
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")
# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)
SSdf
## Component SumSq Percent Labels
## 1 Calories 4235.0 41.05 % Calories\n4235\n41.05 %
## 2 Protein 2997.6 29.05 % Protein\n2998\n29.05 %
## 3 Sugars 1415.5 13.72 % Sugars\n1415\n13.72 %
## 4 Potass 483.1 4.68 % Potass\n483\n4.68 %
## 5 Sodium 454.5 4.41 % Sodium\n455\n4.41 %
## 6 Residuals 395.3 3.83 % Residuals\n395\n3.83 %
## 7 Fat+Vitamin 336.9 3.26 % Fat+Vitamin\n337\n3.26 %
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
geom_bar(stat = "identity", width = 1) +
geom_label(aes(label = Labels, fill_alpha("white", 0)),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the bar chart
ggplot(SSdf, aes(x = reorder(Component, -SumSq), y = SumSq, fill = reorder(Component, -SumSq))) +
geom_col() +
geom_text(aes(label = Labels), colour = "black",
position = position_stack(vjust = 0.5)) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)
# Prepare Table
Pareto <- SSdf$SumSq
names(Pareto) <- SSdf$Component
# Coordinates for labelling the cumulative line on the Pareto chart
line_x <- seq_along(SSdf$Component) + 0.5 # Adjust x-position if needed
line_y <- CumSum # cumulative sum of squares computed above
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
ylab = "Sum of Squares", # label y left
# colors of the chart
col = Colours,
# ranges of the percentages at the right
cumperc = seq(0, 100, by = 20),
# label y right
ylab2 = "Cumulative Percentage",
# title of the chart
main = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
# font size
cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0,
)
##
## Pareto chart analysis for Pareto
## Frequency Cum.Freq. Percentage Cum.Percent.
## Calories 4235.020 4235.020 41.045 41.045
## Protein 2997.566 7232.586 29.052 70.098
## Sugars 1415.494 8648.080 13.719 83.816
## Potass 483.082 9131.162 4.682 88.498
## Sodium 454.512 9585.674 4.405 92.904
## Residuals 395.343 9981.017 3.832 96.735
## Fat+Vitamin 336.863 10317.880 3.265 100.000
# Overlay the cumulative line (drawn on the left-hand Sum of Squares scale)
lines(line_x, line_y, col = "red")
# Add cumulative-percentage labels to the line
text(line_x, line_y, labels = paste0(round(CumSum / sum(SSdf$SumSq) * 100, 1), "%"), pos = 3, col = "red")
# Make predictions on the testing data
predictions <- predict(LinReg06, newdata = CerealsTest)
Use performance metrics such as R-squared, RMSE, and MAE to assess model accuracy. Compare predicted values with actual values to measure predictive power. These metrics help in comparing models and improving predictions.
# Calculate RMSE
predictions <- predict(LinReg06, newdata = CerealsTrain)
actual <- CerealsTrain$Rating
RMSE <- sqrt(mean((predictions - actual)^2))
cat("\n\nRMSE of Cereal Train: ", RMSE)
##
##
## RMSE of Cereal Train: 2.546
predictions <- predict(LinReg06, newdata = CerealsTest)
actual <- CerealsTest$Rating
RMSE <- sqrt(mean((predictions - actual)^2))
cat("\n\nRMSE of Cereal Test : ", RMSE)
##
##
## RMSE of Cereal Test : 3.937
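For completeness, the mean absolute error (MAE) and an out-of-sample R-squared can be derived from the same test-set objects; this is a sketch using the predictions and actual values computed above.
# Calculate MAE and Test R-squared
MAE <- mean(abs(predictions - actual))
R2Test <- 1 - sum((actual - predictions)^2) / sum((actual - mean(actual))^2)
cat("\n\nMAE of Cereal Test : ", MAE)
cat("\nR-squared of Cereal Test: ", R2Test)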
Plot actual vs. predicted values to assess model fit visually. Ideally, points should align closely with the diagonal line, indicating good predictive performance. This visualisation helps to detect systematic errors and trends.
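A minimal sketch of such a plot, reusing the test-set predictions and actual values from the previous step:
# Plot Actual vs Predicted Ratings on the test data
plot(actual, predictions, main = "Actual vs Predicted Ratings (Test Data)",
xlab = "Actual Rating", ylab = "Predicted Rating")
abline(a = 0, b = 1, col = "red", lty = 2) # 45-degree line of perfect agreement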
# Add Model Data to CerealsTrain
CerealsTrain$Resid <- LinReg06$residuals
CerealsTrain$Fit <- LinReg06$fitted.values
MeanRes <- mean(CerealsTrain$Resid)
SDRes <- sd(CerealsTrain$Resid)
CerealsTrain$StdRes <- (CerealsTrain$Resid - MeanRes) / SDRes
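# Note (a sketch): base R's rstandard() returns internally studentised residuals,
# which differ slightly from the simple z-scored residuals computed above
head(rstandard(LinReg06))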
# Set up a 2x2 plotting environment
par(mfrow = c(2, 2))
# Residuals vs Fitted
plot(CerealsTrain$Fit, CerealsTrain$StdRes, main = "Std Residuals vs Fitted", xlab = "Fitted values", ylab = "Std Residuals")
abline(h = 0, col = "red")
# Normal Q-Q
qqnorm(LinReg06$residuals, main = "Normal Q-Q")
qqline(LinReg06$residuals, col = "red")
# Residuals vs Time (ID)
plot(CerealsTrain$Id, CerealsTrain$StdRes, main = "Residuals over Time (Sequence of data)", xlab = "Observation Order (ID)", ylab = "Residuals")
abline(h = 0, col = "red")
# Calculating Cook's Distance
cutoff <- 4/(nrow(CerealsTrain)-length(LinReg06$coefficients)-2)
# Plot Cook's Distance for each observation
plot(LinReg06, which = 4, cook.levels = cutoff)
abline(h = cutoff, lty = 2, col = "red")
# Reset plotting environment
par(mfrow = c(1, 1))
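# A further check (sketch): car's outlierTest() flags the most extreme
# studentised residual with a Bonferroni-adjusted p-value
outlierTest(LinReg06)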
# Download Data from URL
# Collapse Categories
levels(Adults$marital.status)[2:4] <- "Married"
levels(Adults$workclass)[c(2,3,8)] <- "Gov"
levels(Adults$workclass)[c(5, 6)] <- "Self"
# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
# Prepare Columns
names(Adults) <- str_to_title(names(Adults))
names(Adults) <- substr(names(Adults), 1, 6)
colnames(Adults)[2] <- "Class"
colnames(Adults)[5] <- "EduNum"
colnames(Adults)[6] <- "Marital"
colnames(Adults)[7] <- "Occupat"
colnames(Adults)[8] <- "Relationship"
colnames(Adults)[11] <- "CapGain"
colnames(Adults)[12] <- "CapLoss"
colnames(Adults)[13] <- "WeekHours"
# Change Column
Adults$Income <- ifelse(Adults$Income == ">50K", "Earns >50k", "Earns <=50k")
# Show 4 Rows of Dataset
head(Adults, 4)
## Age Class Fnlwgt Educat EduNum Marital Occupat
## 1 25 Private 226802 11th 7 Never-married Machine-op-inspct
## 2 38 Private 89814 HS-grad 9 Married Farming-fishing
## 3 28 Gov 336951 Assoc-acdm 12 Married Protective-serv
## 4 44 Private 160323 Some-college 10 Married Machine-op-inspct
## Relationship Race Gender CapGain CapLoss WeekHours Native Income
## 1 Own-child Black Male 0 0 40 United-States Earns <=50k
## 2 Husband White Male 0 0 50 United-States Earns <=50k
## 3 Husband White Male 0 0 40 United-States Earns >50k
## 4 Husband Black Male 7688 0 40 United-States Earns >50k
# Write data to working directory
write.csv(Adults, file = "Adults.csv")
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Adults)
## [1] "Age" "Class" "Fnlwgt" "Educat" "EduNum"
## [6] "Marital" "Occupat" "Relationship" "Race" "Gender"
## [11] "CapGain" "CapLoss" "WeekHours" "Native" "Income"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Adults)
## 'data.frame': 48842 obs. of 15 variables:
## $ Age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ Class : Factor w/ 6 levels "?","Gov","Never-worked",..: 4 4 2 4 1 4 1 5 4 4 ...
## $ Fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ Educat : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ EduNum : int 7 9 12 10 10 6 9 15 10 4 ...
## $ Marital : Factor w/ 5 levels "Divorced","Married",..: 3 2 2 2 3 3 3 2 3 2 ...
## $ Occupat : Factor w/ 15 levels "?","Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
## $ Relationship: Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ Race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ CapGain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ CapLoss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ WeekHours : int 40 50 40 40 30 30 40 32 40 10 ...
## $ Native : Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 40 ...
## $ Income : chr "Earns <=50k" "Earns <=50k" "Earns >50k" "Earns >50k" ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Adults, 5)
## Age Class Fnlwgt Educat EduNum Marital Occupat
## 1 25 Private 226802 11th 7 Never-married Machine-op-inspct
## 2 38 Private 89814 HS-grad 9 Married Farming-fishing
## 3 28 Gov 336951 Assoc-acdm 12 Married Protective-serv
## 4 44 Private 160323 Some-college 10 Married Machine-op-inspct
## 5 18 ? 103497 Some-college 10 Never-married ?
## Relationship Race Gender CapGain CapLoss WeekHours Native Income
## 1 Own-child Black Male 0 0 40 United-States Earns <=50k
## 2 Husband White Male 0 0 50 United-States Earns <=50k
## 3 Husband White Male 0 0 40 United-States Earns >50k
## 4 Husband Black Male 7688 0 40 United-States Earns >50k
## 5 Own-child White Female 0 0 30 United-States Earns <=50k
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Adults, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50
## 1 Age 48842 38.644 13.711 17 28 37
## 2 Class 48842
## 3 ... ? 2799 5.731%
## 4 ... Gov 6549 13.409%
## 5 ... Never-worked 10 0.02%
## 6 ... Private 33906 69.42%
## 7 ... Self 5557 11.378%
## 8 ... Without-pay 21 0.043%
## 9 Fnlwgt 48842 189664 105604 12285 117551 178145
## 10 Educat 48842
## 11 ... 10th 1389 2.844%
## 12 ... 11th 1812 3.71%
## 13 ... 12th 657 1.345%
## 14 ... 1st-4th 247 0.506%
## 15 ... 5th-6th 509 1.042%
## 16 ... 7th-8th 955 1.955%
## 17 ... 9th 756 1.548%
## 18 ... Assoc-acdm 1601 3.278%
## 19 ... Assoc-voc 2061 4.22%
## 20 ... Bachelors 8025 16.431%
## 21 ... Doctorate 594 1.216%
## 22 ... HS-grad 15784 32.316%
## 23 ... Masters 2657 5.44%
## 24 ... Preschool 83 0.17%
## 25 ... Prof-school 834 1.708%
## 26 ... Some-college 10878 22.272%
## 27 EduNum 48842 10.078 2.571 1 9 10
## 28 Marital 48842
## 29 ... Divorced 6633 13.581%
## 30 ... Married 23044 47.181%
## 31 ... Never-married 16117 32.998%
## 32 ... Separated 1530 3.133%
## 33 ... Widowed 1518 3.108%
## 34 Occupat 48842
## 35 ... ? 2809 5.751%
## 36 ... Adm-clerical 5611 11.488%
## 37 ... Armed-Forces 15 0.031%
## 38 ... Craft-repair 6112 12.514%
## 39 ... Exec-managerial 6086 12.461%
## 40 ... Farming-fishing 1490 3.051%
## 41 ... Handlers-cleaners 2072 4.242%
## 42 ... Machine-op-inspct 3022 6.187%
## 43 ... Other-service 4923 10.079%
## 44 ... Priv-house-serv 242 0.495%
## 45 ... Prof-specialty 6172 12.637%
## 46 ... Protective-serv 983 2.013%
## 47 ... Sales 5504 11.269%
## 48 ... Tech-support 1446 2.961%
## 49 ... Transport-moving 2355 4.822%
## 50 Relationship 48842
## 51 ... Husband 19716 40.367%
## 52 ... Not-in-family 12583 25.763%
## 53 ... Other-relative 1506 3.083%
## 54 ... Own-child 7581 15.521%
## 55 ... Unmarried 5125 10.493%
## 56 ... Wife 2331 4.773%
## 57 Race 48842
## 58 ... Amer-Indian-Eskimo 470 0.962%
## 59 ... Asian-Pac-Islander 1519 3.11%
## 60 ... Black 4685 9.592%
## 61 ... Other 406 0.831%
## 62 ... White 41762 85.504%
## 63 Gender 48842
## 64 ... Female 16192 33.152%
## 65 ... Male 32650 66.848%
## 66 CapGain 48842 1079.1 7452 0 0 0
## 67 CapLoss 48842 87.502 403 0 0 0
## 68 WeekHours 48842 40.422 12.391 1 40 40
## 69 Native 48842
## 70 ... ? 857 1.755%
## 71 ... Cambodia 28 0.057%
## 72 ... Canada 182 0.373%
## 73 ... China 122 0.25%
## 74 ... Columbia 85 0.174%
## 75 ... Cuba 138 0.283%
## 76 ... Dominican-Republic 103 0.211%
## 77 ... Ecuador 45 0.092%
## 78 ... El-Salvador 155 0.317%
## 79 ... England 127 0.26%
## 80 ... France 38 0.078%
## 81 ... Germany 206 0.422%
## 82 ... Greece 49 0.1%
## 83 ... Guatemala 88 0.18%
## 84 ... Haiti 75 0.154%
## 85 ... Holand-Netherlands 1 0.002%
## 86 ... Honduras 20 0.041%
## 87 ... Hong 30 0.061%
## 88 ... Hungary 19 0.039%
## 89 ... India 151 0.309%
## 90 ... Iran 59 0.121%
## 91 ... Ireland 37 0.076%
## 92 ... Italy 105 0.215%
## 93 ... Jamaica 106 0.217%
## 94 ... Japan 92 0.188%
## 95 ... Laos 23 0.047%
## 96 ... Mexico 951 1.947%
## 97 ... Nicaragua 49 0.1%
## 98 ... Outlying-US(Guam-USVI-etc) 23 0.047%
## 99 ... Peru 46 0.094%
## 100 ... Philippines 295 0.604%
## 101 ... Poland 87 0.178%
## 102 ... Portugal 67 0.137%
## 103 ... Puerto-Rico 184 0.377%
## 104 ... Scotland 21 0.043%
## 105 ... South 115 0.235%
## 106 ... Taiwan 65 0.133%
## 107 ... Thailand 30 0.061%
## 108 ... Trinadad&Tobago 27 0.055%
## 109 ... United-States 43832 89.742%
## 110 ... Vietnam 86 0.176%
## 111 ... Yugoslavia 23 0.047%
## 112 Income 48842
## 113 ... Earns <=50k 37155 76.072%
## 114 ... Earns >50k 11687 23.928%
## P75 Max NA Mode
## 1 48 90 0
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 237642 1490400 0
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27 12 16 0
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
## 51
## 52
## 53
## 54
## 55
## 56
## 57
## 58
## 59
## 60
## 61
## 62
## 63
## 64
## 65
## 66 0 99999 0
## 67 0 4356 0
## 68 45 99 0
## 69
## 70
## 71
## 72
## 73
## 74
## 75
## 76
## 77
## 78
## 79
## 80
## 81
## 82
## 83
## 84
## 85
## 86
## 87
## 88
## 89
## 90
## 91
## 92
## 93
## 94
## 95
## 96
## 97
## 98
## 99
## 100
## 101
## 102
## 103
## 104
## 105
## 106
## 107
## 108
## 109
## 110
## 111
## 112
## 113
## 114
# Standardise Numeric Variables
Adults$Age.z <- (Adults$Age - mean(Adults$Age))/sd(Adults$Age)
Adults$EduNum.z <- (Adults$EduNum - mean(Adults$EduNum))/sd(Adults$EduNum)
Adults$CapGain.z <- (Adults$CapGain - mean(Adults$CapGain))/sd(Adults$CapGain)
Adults$CapLoss.z <- (Adults$CapLoss - mean(Adults$CapLoss))/sd(Adults$CapLoss)
Adults$WeekHours.z <- (Adults$WeekHours - mean(Adults$WeekHours))/sd(Adults$WeekHours)
names(Adults)
## [1] "Age" "Class" "Fnlwgt" "Educat" "EduNum"
## [6] "Marital" "Occupat" "Relationship" "Race" "Gender"
## [11] "CapGain" "CapLoss" "WeekHours" "Native" "Income"
## [16] "Age.z" "EduNum.z" "CapGain.z" "CapLoss.z" "WeekHours.z"
# Split data into train and test
set.seed(123)
split <- initial_split(Adults, prop = 0.80)
AdultsTrain <- training(split)
AdultsTest <- testing(split)
# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)
if(! "rpart.plot" %in% installed.packages()) { install.packages("rpart.plot", dependencies = TRUE) }
library(rpart.plot)
# Calculate the Income Tree
IncomeTree <- rpart(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "class")
print(IncomeTree)
## n= 39073
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 39073 9401 Earns <=50k (0.75940 0.24060)
## 2) Marital=Divorced,Never-married,Separated,Widowed 20577 1311 Earns <=50k (0.93629 0.06371)
## 4) CapGain.z< 0.802 20208 957 Earns <=50k (0.95264 0.04736) *
## 5) CapGain.z>=0.802 369 15 Earns >50k (0.04065 0.95935) *
## 3) Marital=Married 18496 8090 Earns <=50k (0.56261 0.43739)
## 6) EduNum.z< 0.942 13047 4235 Earns <=50k (0.67540 0.32460)
## 12) CapGain.z< 0.539 12408 3608 Earns <=50k (0.70922 0.29078)
## 24) EduNum.z< -0.6138 2155 222 Earns <=50k (0.89698 0.10302) *
## 25) EduNum.z>=-0.6138 10253 3386 Earns <=50k (0.66976 0.33024)
## 50) CapLoss.z< 4.363 9842 3076 Earns <=50k (0.68746 0.31254) *
## 51) CapLoss.z>=4.363 411 101 Earns >50k (0.24574 0.75426) *
## 13) CapGain.z>=0.539 639 12 Earns >50k (0.01878 0.98122) *
## 7) EduNum.z>=0.942 5449 1594 Earns >50k (0.29253 0.70747) *
# Plot the Decision Tree
cat("\n\nThe following information is shown in each node:\n")
##
##
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?",
under = FALSE,
type = 2,
extra = 106,
clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
nn = TRUE,
fallen.leaves = TRUE,
digits = 3,
box.palette = "RdGn"
)
# Checking the order of variable importance
IncomeTree$variable.importance
## Marital CapGain.z EduNum.z Gender Age.z WeekHours.z
## 2720.3 1444.1 1310.8 928.9 745.8 399.3
## Class CapLoss.z
## 255.6 165.7
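# Visualise the importance scores (a sketch): a simple horizontal bar plot
barplot(sort(IncomeTree$variable.importance), horiz = TRUE, las = 1,
cex.names = 0.8, main = "Variable Importance (Income Tree)")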
# Calculate Predicted Values
AdultsTest$Pred = predict(IncomeTree, AdultsTest, type = "class")
# Fit the model on the training set
set.seed(123)
IncomeTree <- train(
Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "rpart", trControl = trainControl("cv", number = 10), tuneLength = 10
)
# Plot model accuracy vs different values of cp (complexity parameter)
plot(IncomeTree,
main = "Model Accuracy vs Different Values of cp (Model Complexity)",
pch = 16,
col = "blue"
)
cat("\n\nThe complexity parameter (cp) plays a crucial role in controlling the size of the decision tree.\n")
##
##
## The complexity parameter (cp) plays a crucial role in controlling the size of the decision tree.
cat("A smaller cp value results in a more complex tree with more splits. A larger cp value results in a simpler tree with fewer splits.\n\n")
## A smaller cp value results in a more complex tree with more splits. A larger cp value results in a simpler tree with fewer splits.
# Print the best tuning parameter cp that maximizes the model accuracy
cat("\n\nThe Best Tuning Parameter cp that Maximizes Model Accuracy is:")
##
##
## The Best Tuning Parameter cp that Maximizes Model Accuracy is:
IncomeTree$bestTune
## cp
## 1 0.0009573
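# A sketch (hypothetical object name TunedTree): the tuned cp could be passed to
# rpart directly via rpart.control instead of refitting with the default cp = 0.01
TunedTree <- rpart(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "class", control = rpart.control(cp = IncomeTree$bestTune$cp))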
# Plot the final tree model
# par(xpd = NA) # Avoid clipping the text in some device
# plot(IncomeTree2$finalModel)
# text(IncomeTree2$finalModel, digits = 3)
IncomeTree <- rpart(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain, method = "class")
# Decision rules in the model
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?",
under = FALSE,
type = 2,
extra = 106,
clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
nn = TRUE,
fallen.leaves = TRUE,
digits = 3,
box.palette = "RdGn"
)
# Evaluate the Performance of the Classification Tree
Predicted <- AdultsTest$Pred
Actual <- AdultsTest$Income
table(Predicted, Actual)
## Actual
## Predicted Earns <=50k Earns >50k
## Earns <=50k 7076 1048
## Earns >50k 407 1238
# Compute model accuracy rate on test data
ModAcc <- mean(Predicted == Actual)
cat("\n\nThe overall accuracy of our tree model is", ModAcc)
##
##
## The overall accuracy of our tree model is 0.8511
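# Fuller metrics (a sketch): caret is already loaded, so confusionMatrix() can
# report sensitivity, specificity and kappa alongside accuracy
confusionMatrix(factor(Predicted), factor(Actual), positive = "Earns >50k")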
# Check Whether Pruning Reduces the Error Rate
printcp(IncomeTree)
##
## Classification tree:
## rpart(formula = Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z +
## WeekHours.z + Race + Gender + Class + Marital, data = AdultsTrain,
## method = "class")
##
## Variables actually used in tree construction:
## [1] CapGain.z CapLoss.z EduNum.z Marital
##
## Root node error: 9401/39073 = 0.24
##
## n= 39073
##
## CP nsplit rel error xerror xstd
## 1 0.120 0 1.00 1.00 0.0090
## 2 0.065 2 0.76 0.76 0.0081
## 3 0.036 3 0.69 0.69 0.0078
## 4 0.011 4 0.66 0.66 0.0077
## 5 0.010 6 0.64 0.64 0.0076
# Explicitly request the lowest cp value
LowCP <- IncomeTree$cptable[which.min(IncomeTree$cptable[,"xerror"]),"CP"]
cat("\n\nThe Lowest Complexity Parameter (CP) is", LowCP, ".\n")
##
##
## The Lowest Complexity Parameter (CP) is 0.01 .
# Prune the tree built on the training data, using the cp value with the lowest cross-validated error
bestcp <- IncomeTree$cptable[which.min(IncomeTree$cptable[,"xerror"]),"CP"]
PrunedTree <- prune(IncomeTree, cp = bestcp)
rpart.plot(PrunedTree, main = "Classification Tree: Is a Person Likely to Earn 50k?",
under = FALSE,
type = 2,
extra = 106,
clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
nn = TRUE,
fallen.leaves = TRUE,
digits = 3,
box.palette = "RdGn"
)
# Generate predictions from the pruned tree on the test data
AdultsTest$PrunedPred = predict(PrunedTree, AdultsTest, type = "class")
# Evaluate the Performance of the Pruned Tree
PrunedPred <- AdultsTest$PrunedPred
table(PrunedPred, Actual)
## Actual
## PrunedPred Earns <=50k Earns >50k
## Earns <=50k 7076 1048
## Earns >50k 407 1238
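# Compute model accuracy of the pruned tree on the test data; since the lowest-xerror
# cp equals the default 0.01, it matches the unpruned result above (a sketch)
PrunedAcc <- mean(PrunedPred == Actual)
cat("\n\nThe overall accuracy of the pruned tree model is", PrunedAcc)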
# Download Wine Data from URL
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
## [13] "type"
# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
# Prepare Column Names
colnames(Wines)[1] <- "FixAcid"
colnames(Wines)[2] <- "VolAcid"
colnames(Wines)[3] <- "CitricAcid"
colnames(Wines)[4] <- "ResSugar"
colnames(Wines)[5] <- "Chlorid"
colnames(Wines)[6] <- "FreeSulf"
colnames(Wines)[7] <- "TotalSulf"
colnames(Wines)[8] <- "Density"
colnames(Wines)[9] <- "PH"
colnames(Wines)[10] <- "Sulphates"
colnames(Wines)[11] <- "Alcohol"
colnames(Wines)[12] <- "Quality"
colnames(Wines)[13] <- "Type"
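The same renaming can also be written as a single, equivalent vector assignment.
# Equivalent one-step renaming of the first 13 columns (same names as above)
colnames(Wines)[1:13] <- c("FixAcid", "VolAcid", "CitricAcid", "ResSugar", "Chlorid",
"FreeSulf", "TotalSulf", "Density", "PH", "Sulphates",
"Alcohol", "Quality", "Type")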
# Add ID Column
Wines$ID <- 1:nrow(Wines)
# Show 4 Rows of Dataset
head(Wines, 4)
## FixAcid VolAcid CitricAcid ResSugar Chlorid FreeSulf TotalSulf Density PH
## 1 7.0 0.27 0.36 20.7 0.045 45 170 1.0010 3.00
## 2 6.3 0.30 0.34 1.6 0.049 14 132 0.9940 3.30
## 3 8.1 0.28 0.40 6.9 0.050 30 97 0.9951 3.26
## 4 7.2 0.23 0.32 8.5 0.058 47 186 0.9956 3.19
## Sulphates Alcohol Quality Type ID
## 1 0.45 8.8 6 White 1
## 2 0.49 9.5 6 White 2
## 3 0.44 10.1 6 White 3
## 4 0.40 9.9 6 White 4
# Write data to working directory
write.csv(Wines, file = "Wines.csv")
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Wines)
## [1] "FixAcid" "VolAcid" "CitricAcid" "ResSugar" "Chlorid"
## [6] "FreeSulf" "TotalSulf" "Density" "PH" "Sulphates"
## [11] "Alcohol" "Quality" "Type" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Wines)
## 'data.frame': 6497 obs. of 14 variables:
## $ FixAcid : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ VolAcid : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ CitricAcid: num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ ResSugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ Chlorid : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ FreeSulf : num 45 14 30 47 47 30 30 45 14 28 ...
## $ TotalSulf : num 170 132 97 186 186 97 136 170 132 129 ...
## $ Density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ PH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ Sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ Alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ Quality : int 6 6 6 6 6 6 6 6 6 6 ...
## $ Type : chr "White" "White" "White" "White" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Wines, 5)
## FixAcid VolAcid CitricAcid ResSugar Chlorid FreeSulf TotalSulf Density PH
## 1 7.0 0.27 0.36 20.7 0.045 45 170 1.0010 3.00
## 2 6.3 0.30 0.34 1.6 0.049 14 132 0.9940 3.30
## 3 8.1 0.28 0.40 6.9 0.050 30 97 0.9951 3.26
## 4 7.2 0.23 0.32 8.5 0.058 47 186 0.9956 3.19
## 5 7.2 0.23 0.32 8.5 0.058 47 186 0.9956 3.19
## Sulphates Alcohol Quality Type ID
## 1 0.45 8.8 6 White 1
## 2 0.49 9.5 6 White 2
## 3 0.44 10.1 6 White 3
## 4 0.40 9.9 6 White 4
## 5 0.40 9.9 6 White 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Wines, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA
## 1 FixAcid 6497 7.2153 1.2964 3.8 6.4 7 7.7 15.9 0
## 2 VolAcid 6497 0.33967 0.16464 0.08 0.23 0.29 0.4 1.58 0
## 3 CitricAcid 6497 0.31863 0.14532 0 0.25 0.31 0.39 1.66 0
## 4 ResSugar 6497 5.4432 4.7578 0.6 1.8 3 8.1 65.8 0
## 5 Chlorid 6497 0.056034 0.035034 0.009 0.038 0.047 0.065 0.611 0
## 6 FreeSulf 6497 30.525 17.749 1 17 29 41 289 0
## 7 TotalSulf 6497 115.74 56.522 6 77 118 156 440 0
## 8 Density 6497 0.9947 0.0029987 0.98711 0.99234 0.99489 0.99699 1.039 0
## 9 PH 6497 3.2185 0.16079 2.72 3.11 3.21 3.32 4.01 0
## 10 Sulphates 6497 0.53127 0.14881 0.22 0.43 0.51 0.6 2 0
## 11 Alcohol 6497 10.492 1.1927 8 9.5 10.3 11.3 14.9 0
## 12 Quality 6497 5.8184 0.87326 3 5 6 6 9 0
## 13 Type 6497
## 14 ... Red 1599 24.611%
## 15 ... White 4898 75.389%
## 16 ID 6497 3249 1875.7 1 1625 3249 4873 6497 0
## Mode
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
RedMean <- mean(Wines$Quality[Wines$Type == "Red"])
WhiteMean <- mean(Wines$Quality[Wines$Type == "White"])
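The same group means can also be obtained with dplyr (loaded earlier), which extends more easily to additional summary statistics; a minimal sketch.
# Mean Quality by wine type via dplyr (equivalent to the two lines above)
Wines %>%
group_by(Type) %>%
summarise(MeanQuality = mean(Quality), .groups = "drop")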
names(Wines)
## [1] "FixAcid" "VolAcid" "CitricAcid" "ResSugar" "Chlorid"
## [6] "FreeSulf" "TotalSulf" "Density" "PH" "Sulphates"
## [11] "Alcohol" "Quality" "Type" "ID"
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF")
# Plot Result Variable
ggplot(Wines,
aes(factor(Quality))) +
geom_bar(fill = "#7c0c18") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'none',
legend.title = element_text(size = 16),
legend.text = element_text(size = 16),
# axis.text.x=element_blank(), #remove x axis labels
# axis.ticks.x=element_blank(), #remove x axis ticks
# axis.text.y=element_blank(), #remove y axis labels
#axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Distribution of Wine Quality",
subtitle = "Based on Real Data from Portugal",
caption = "Data source: Public Domain",
fill = "Components",
x = "Quality Score",
y = "Count of Wines")
# Load Library if not available
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)
# Build BoxPlot
boxplot(Quality ~ Type, data = Wines,
main = "Wine Quality by Type",
xlab = "Quality Indicator",
ylab = "Type",
col = c("darkred", "yellow"),
border = "brown",
horizontal = FALSE,
notch = TRUE
)
# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
WinesUnstacked <- Wines %>%
group_by(Type, Quality) %>%
summarise(Count = n(), .groups = 'drop')
# Build BarPlot
barplot(
# Plot data
Count ~ Type + Quality, data = WinesUnstacked, beside = TRUE,
# Axis labels
xlab = "Type of Wine and Quality Score",
ylab = "Count of Wines",
ylim = c(0, 2500),
axes = TRUE,
# Graph title
main = "Wine Quality by Type",
subtitle = "Based on Real Data from Portugal",
# Add colour
col = c("darkred", "beige"),
# Add a legend
)
# Add labels with some offset to be above the bar
# text(x = , y = WinesUnstacked$Count,labels = round(WinesUnstacked$Count, digits = 2))
# Add Legend
legend("topleft", fill = c("darkred", "beige"), legend = c("Red Wine", "White Wine"),
horiz = FALSE, inset = c(0.05, 0.05), xpd = TRUE, cex = 1.2, bg = "white")
# Draw a box outline
box()
# Draw Gridlines
grid(NA, 5, # horizontal grid lines only
lty = 3, # Grid line type
col = "lightgray", # Grid line color
lwd = 2) # Grid line width
# Check Normality
cat("\n\nNormality Test for Red Wine: ")
##
##
## Normality Test for Red Wine:
spvalue1 <- shapiro.test(subset(Wines, Type == "Red")$Quality)$p.value
if (spvalue1 < 0.05) {
cat("\nThe Red Wine Quality score is not normally distributed.")
} else {
cat("\nThe Red Wine Quality score is normally distributed.")
}
##
## The Red Wine Quality score is not normally distributed.
cat("\n\nNormality Test for White Wine: ")
##
##
## Normality Test for White Wine:
spvalue2 <- shapiro.test(subset(Wines, Type == "White")$Quality)$p.value
if (spvalue2 < 0.05) {
cat("\nThe White Wine Quality score is not normally distributed.")
} else {
cat("\nThe White Wine Quality score is normally distributed.")
}
##
## The White Wine Quality score is not normally distributed.
if (spvalue1 < 0.05 | spvalue2 < 0.05) {
# Perform Test on Difference
testOnDiff <- "Wilcoxon Rank Sum Test"
wtest <- wilcox.test(Wines$Quality ~ Wines$Type)
pvalue <- round(wtest$p.value, 4)
} else {
# Perform Test on Difference
testOnDiff <- "Student's t-Test"
ttest <- t.test(Quality ~ Type, data = Wines)
pvalue <- round(ttest$p.value, 4)
diff1 <- abs(ttest$conf.int[1])
diff2 <- abs(ttest$conf.int[2])
if (diff1 > diff2) {
diff0 <- diff2
diff2 <- diff1
diff1 <- diff0
}
}
cat("\n\nBased on", testOnDiff, "the p-value for assuming a difference between Quality of Red and White Wine is:", pvalue, ". ")
##
##
## Based on Wilcoxon Rank Sum Test the p-value for assuming a difference between Quality of Red and White Wine is: 0 .
if (pvalue < 0.05) {
cat("This means there is a significant difference between the Quality of Red and White Wine.")
cat("\n\nThe average Quality rating for White Wine is", WhiteMean, "and for Red Wine is", RedMean, ".")
if (spvalue1 > 0.05 & spvalue2 > 0.05) {
cat("\nThe difference is between", round(diff1, 4), "and", round(diff2, 4), ".")
}
} else {
cat("This means there is no significant difference between the Quality of Red and White Wine.")
}
## This means there is a significant difference between the Quality of Red and White Wine.
##
## The average Quality rating for White Wine is 5.878 and for Red Wine is 5.636 .
# Collapse 3 and 9
Wines$QualityFull <- Wines$Quality
Wines$QualityN <- case_when(Wines$Quality == 3 ~ 4,
Wines$Quality == 9 ~ 8,
!Wines$Quality %in% c(3,9) ~ Wines$Quality)
Wines$Quality <- as.factor(Wines$Quality)
Wines <- as.data.frame(Wines[, -15])
names(Wines)
## [1] "FixAcid" "VolAcid" "CitricAcid" "ResSugar" "Chlorid"
## [6] "FreeSulf" "TotalSulf" "Density" "PH" "Sulphates"
## [11] "Alcohol" "Quality" "Type" "ID" "QualityN"
# Plot Result Variable
ggplot(Wines,
aes(factor(QualityN))) +
geom_bar(fill = "darkred") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'none',
legend.title = element_text(size = 16),
legend.text = element_text(size = 16),
# axis.text.x=element_blank(), #remove x axis labels
# axis.ticks.x=element_blank(), #remove x axis ticks
# axis.text.y=element_blank(), #remove y axis labels
#axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Distribution of Wine Quality after Collapsing 3 and 9",
subtitle = "Based on Real Data from Portugal",
caption = "Data source: Public Domain",
fill = "Components",
x = "Quality Score",
y = "Count of Wines")
# Split data into train and test
# Load Library if not available (initial_split(), training() and testing() come from rsample)
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)
set.seed(123)
split <- initial_split(Wines, prop = 0.80)
WinesTrain <- training(split)
WinesTest <- testing(split)
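A quick check of the resulting split sizes, assuming the 80/20 proportion requested above.
# Verify the sizes of the training and test partitions
cat("\n\nTraining rows:", nrow(WinesTrain), "- Test rows:", nrow(WinesTest), "\n")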
# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)
# Build Decision Tree 5.0
WineTree <- C5.0(Quality ~ .,
data = WinesTrain[, -15])
WineTree
##
## Call:
## C5.0.formula(formula = Quality ~ ., data = WinesTrain[, -15])
##
## Classification Tree
## Number of samples: 5197
## Number of predictors: 13
##
## Tree size: 635
##
## Non-standard options: attempt to group attributes
WineTreeW <- C5.0(Quality ~ ., data = WinesTrain[, -15],
winnow = TRUE,
trials = 5)
WineTreeW
##
## Call:
## C5.0.formula(formula = Quality ~ ., data = WinesTrain[, -15], winnow =
## TRUE, trials = 5)
##
## Classification Tree
## Number of samples: 5197
## Number of predictors: 13
##
## Number of boosting iterations: 5
## Average tree size: 536.4
##
## Non-standard options: attempt to group attributes
# Load Library if not available
if(! "yardstick" %in% installed.packages()) { install.packages("yardstick", dependencies = TRUE) }
library(yardstick)
class_wines <- tibble(value = WinesTrain$Quality,
predict = predict(WineTree, WinesTrain),
predictw = predict(WineTreeW, WinesTrain))
class_metrics <- metric_set(accuracy, precision, recall)
# Show confusion matrix
cat("\n\nShow Confusion Matrix:\n")
##
##
## Show Confusion Matrix:
conf_mat(class_wines, truth = value, estimate = predict)
## Truth
## Prediction 3 4 5 6 7 8 9
## 3 8 2 1 0 0 0 0
## 4 2 111 8 5 7 1 0
## 5 3 26 1482 155 31 5 0
## 6 6 22 217 2048 112 21 1
## 7 3 4 12 42 716 27 2
## 8 0 0 2 5 6 103 1
## 9 0 0 0 0 0 0 0
cat("\n\nShow Performance Metrics of Original Model:\n")
##
##
## Show Performance Metrics of Original Model:
class_metrics(class_wines, truth = value, estimate = predict)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.860
## 2 precision macro 0.840
## 3 recall macro 0.612
cat("\n\nShow Performance Metrics of Winnowing/Boosting Model:\n")
##
##
## Show Performance Metrics of Winnowing/Boosting Model:
class_metrics(class_wines, truth = value, estimate = predictw)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.975
## 2 precision macro 0.982
## 3 recall macro 0.934
# Save Predictions in WinesTrain
WinesTrain$Pred <- predict(WineTree, WinesTrain)
WinesTrain$PredW <- predict(WineTreeW, WinesTrain)
# Write data to working directory
write.csv(WinesTrain, file = "WinesTrain.csv")
# Show Variable Importance
cat("\n\nShow Variable Importance for Original Model:\n")
##
##
## Show Variable Importance for Original Model:
C5imp(WineTree)
## Overall
## Alcohol 100.00
## VolAcid 84.68
## FreeSulf 77.83
## FixAcid 71.41
## Sulphates 65.79
## PH 63.40
## CitricAcid 53.43
## Chlorid 51.86
## ResSugar 43.06
## ID 40.35
## Density 38.10
## TotalSulf 37.68
## Type 21.95
cat("\n\nShow Variable Importance for Model after Winnowing/Boosting:\n")
##
##
## Show Variable Importance for Model after Winnowing/Boosting:
C5imp(WineTreeW)
## Overall
## VolAcid 100.00
## Alcohol 100.00
## Sulphates 99.31
## PH 98.85
## TotalSulf 98.06
## FreeSulf 97.58
## Chlorid 97.54
## FixAcid 97.38
## CitricAcid 96.06
## ID 93.71
## ResSugar 93.53
## Density 87.20
## Type 63.21
# Calculate the MSE for the original tree
mse <- mean((as.numeric(as.character(WinesTrain$Pred)) - as.numeric(as.character(WinesTrain$Quality)))^2)
cat("\n\nMSE of the Original Tree is", mse)
##
##
## MSE of the Original Tree is 0.2492
# Calculate the MSE for the winnowed/boosted tree
mse <- mean((as.numeric(as.character(WinesTrain$PredW)) - as.numeric(as.character(WinesTrain$Quality)))^2)
cat("\n\nMSE of the Winnowed/Boosted Tree is", mse)
##
##
## MSE of the Winnowed/Boosted Tree is 0.04175
# Extract Importance Data from Tree
C5TreeImp <- as.data.frame(C5imp(WineTree, metric = "usage"))
VarImp <- as.data.frame(C5TreeImp$Overall)
VarImp$Variable <- rownames(C5TreeImp)
colnames(VarImp)[1] <- "Importance"
C5TreeImpW <- as.data.frame(C5imp(WineTreeW, metric = "usage"))
VarImpW <- as.data.frame(C5TreeImpW$Overall)
VarImpW$Variable <- rownames(C5TreeImpW)
colnames(VarImpW)[1] <- "Importance"
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Create the Column Chart
ggplot(VarImp, aes(y = reorder(Variable, Importance), x = Importance)) +
geom_col(fill = "#7c0c18") +
geom_text(
aes(label = paste(round(Importance, 0), "%")),
hjust = 1, nudge_x = -.5,
size = 5, fontface = "bold", family = "Fira Sans", color = "white") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
plot.margin = margin(rep(15, 4)),
panel.background = element_blank(),
panel.grid.major = element_line(color = "lightgrey")
) +
labs(title = "Importance of Model Variables",
subtitle = "Based on Original Classification Tree of Wine Quality Data",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Create the Column Chart
ggplot(VarImpW, aes(y = reorder(Variable, Importance), x = Importance)) +
geom_col(fill = "#7c0c18") +
geom_text(
aes(label = paste(round(Importance, 0), "%")),
hjust = 1, nudge_x = -.5,
size = 5, fontface = "bold", family = "Fira Sans", color = "white") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
plot.margin = margin(rep(15, 4)),
panel.background = element_blank(),
panel.grid.major = element_line(color = "lightgrey")
) +
labs(title = "Importance of Model Variables After Winnowing/Boosting",
subtitle = "Based on Tree after Winnowing/Boosting",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Save Predictions in WinesTest
WinesTest$Pred <- predict(WineTree, WinesTest)
# Performance Indicators for Original Tree
# Calculate the MSE for the original tree
mse <- mean((as.numeric(as.character(WinesTest$Pred)) - as.numeric(as.character(WinesTest$Quality)))^2)
cat("\n\nMSE of the Original Tree is", mse)
##
##
## MSE of the Original Tree is 0.7192
cat("\nRMSE of the Original Tree is", sqrt(mse), "\n\n")
##
## RMSE of the Original Tree is 0.8481
cat("\n\nThe Confusion Matrix explains the following details:\n")
##
##
## The Confusion Matrix explains the following details:
cat("- Accuracy: The proportion of true results (both true positives and true negatives) among the total number of cases examined.\n")
## - Accuracy: The proportion of true results (both true positives and true negatives) among the total number of cases examined.
cat("- Kappa: A statistic that measures inter-rater agreement for categorical items. It is generally thought to be a more robust measure than simple percent agreement calculation since it takes into account the agreement occurring by chance.\n")
## - Kappa: A statistic that measures inter-rater agreement for categorical items. It is generally thought to be a more robust measure than simple percent agreement calculation since it takes into account the agreement occurring by chance.
cat("- No Information Rate (NIR): The accuracy that could be achieved by always predicting the most frequent class.\n")
## - No Information Rate (NIR): The accuracy that could be achieved by always predicting the most frequent class.
cat("- Pos Pred Value (Precision): The proportion of positive results in statistics and machine learning that are true positive results.\n")
## - Pos Pred Value (Precision): The proportion of positive results in statistics and machine learning that are true positive results.
cat("- Neg Pred Value: The proportion of negative results that are true negative results.\n")
## - Neg Pred Value: The proportion of negative results that are true negative results.
cat("- Prevalence: The proportion of the dataset that is positive.\n")
## - Prevalence: The proportion of the dataset that is positive.
cat("- Detection Rate: The rate at which positive instances are detected.\n")
## - Detection Rate: The rate at which positive instances are detected.
cat("- Detection Prevalence: The proportion of predicted positives.\n")
## - Detection Prevalence: The proportion of predicted positives.
cat("- Balanced Accuracy: The average of sensitivity and specificity.\n\n")
## - Balanced Accuracy: The average of sensitivity and specificity.
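Two of these definitions can be verified by hand from the test-set predictions before calling confusionMatrix(); a minimal sketch, which should reproduce the Accuracy and No Information Rate reported below.
# Hand calculation of Accuracy and the No Information Rate (sketch)
tab <- table(WinesTest$Pred, WinesTest$Quality)
cat("\nAccuracy by hand:", sum(diag(tab)) / sum(tab),
"- No Information Rate by hand:", max(table(WinesTest$Quality)) / nrow(WinesTest), "\n")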
confMat <- confusionMatrix((WinesTest$Pred), (WinesTest$Quality))
confMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 2 9 11 10 3 0 0
## 5 4 21 233 133 12 1 1
## 6 1 17 158 375 85 18 0
## 7 1 3 12 60 98 14 0
## 8 0 1 2 3 9 3 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.552
## 95% CI : (0.525, 0.58)
## No Information Rate : 0.447
## P-Value [Acc > NIR] : 0.0000000000000164
##
## Kappa : 0.312
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.17647 0.560 0.645 0.4734 0.08333
## Specificity 1.00000 0.97918 0.805 0.612 0.9177 0.98813
## Pos Pred Value NaN 0.25714 0.575 0.573 0.5213 0.16667
## Neg Pred Value 0.99385 0.96680 0.796 0.681 0.9020 0.97426
## Prevalence 0.00615 0.03923 0.320 0.447 0.1592 0.02769
## Detection Rate 0.00000 0.00692 0.179 0.288 0.0754 0.00231
## Detection Prevalence 0.00000 0.02692 0.312 0.503 0.1446 0.01385
## Balanced Accuracy 0.50000 0.57783 0.683 0.629 0.6955 0.53573
## Class: 9
## Sensitivity 0.000000
## Specificity 1.000000
## Pos Pred Value NaN
## Neg Pred Value 0.999231
## Prevalence 0.000769
## Detection Rate 0.000000
## Detection Prevalence 0.000000
## Balanced Accuracy 0.500000
Sensitivity <- confMat$byClass[, 'Sensitivity']
Specificity <- confMat$byClass[, 'Specificity']
# Save Predictions in WinesTest
WinesTest$PredW <- predict(WineTreeW, WinesTest)
# Performance Indicators for Winnowed/Boosted Tree
# Calculate the MSE for the Winnowed/Boosted tree
mse <- mean((as.numeric(as.character(WinesTest$PredW)) - as.numeric(as.character(WinesTest$Quality)))^2)
cat("\n\nMSE of the Winnowed/Boosted Tree is", mse)
##
##
## MSE of the Winnowed/Boosted Tree is 0.5523
cat("\nRMSE of the Winnowed/Boosted Tree is", sqrt(mse), "\n\n")
##
## RMSE of the Winnowed/Boosted Tree is 0.7432
confMat <- confusionMatrix((WinesTest$PredW), (WinesTest$Quality))
confMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 3 9 10 2 0 0 0
## 5 4 27 264 113 6 0 1
## 6 0 12 131 415 92 14 0
## 7 1 2 10 48 103 9 0
## 8 0 1 1 3 6 13 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.618
## 95% CI : (0.591, 0.645)
## No Information Rate : 0.447
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.41
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.17647 0.635 0.714 0.4976 0.3611
## Specificity 1.00000 0.98799 0.829 0.654 0.9360 0.9913
## Pos Pred Value NaN 0.37500 0.636 0.625 0.5954 0.5417
## Neg Pred Value 0.99385 0.96708 0.828 0.739 0.9077 0.9820
## Prevalence 0.00615 0.03923 0.320 0.447 0.1592 0.0277
## Detection Rate 0.00000 0.00692 0.203 0.319 0.0792 0.0100
## Detection Prevalence 0.00000 0.01846 0.319 0.511 0.1331 0.0185
## Balanced Accuracy 0.50000 0.58223 0.732 0.684 0.7168 0.6762
## Class: 9
## Sensitivity 0.000000
## Specificity 1.000000
## Pos Pred Value NaN
## Neg Pred Value 0.999231
## Prevalence 0.000769
## Detection Rate 0.000000
## Detection Prevalence 0.000000
## Balanced Accuracy 0.500000
Sensitivity <- confMat$byClass[, 'Sensitivity']
Specificity <- confMat$byClass[, 'Specificity']
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(as.character(Wines$Quality))
# Run Linear Regression with all Xs
LinReg09 <- lm(QualityNum ~ FixAcid + VolAcid + CitricAcid + ResSugar + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol + Type, data = Wines)
summary(LinReg09)
##
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + CitricAcid + ResSugar +
## Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates +
## Alcohol + Type, data = Wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.780 -0.467 -0.044 0.456 3.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.751754 14.135540 7.41 0.00000000000014 ***
## FixAcid 0.085066 0.015764 5.40 0.00000007046421 ***
## VolAcid -1.492406 0.081351 -18.35 < 0.0000000000000002 ***
## CitricAcid -0.062621 0.079720 -0.79 0.432
## ResSugar 0.062437 0.005934 10.52 < 0.0000000000000002 ***
## Chlorid -0.757251 0.334445 -2.26 0.024 *
## FreeSulf 0.004937 0.000766 6.44 0.00000000012535 ***
## TotalSulf -0.001403 0.000324 -4.33 0.00001490646374 ***
## Density -103.909624 14.335952 -7.25 0.00000000000047 ***
## PH 0.498756 0.090579 5.51 0.00000003805505 ***
## Sulphates 0.721744 0.076243 9.47 < 0.0000000000000002 ***
## Alcohol 0.222670 0.018074 12.32 < 0.0000000000000002 ***
## TypeWhite -0.361332 0.056753 -6.37 0.00000000020631 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.733 on 6484 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.295
## F-statistic: 228 on 12 and 6484 DF, p-value: <0.0000000000000002
anova(LinReg09)
## Analysis of Variance Table
##
## Response: QualityNum
## Df Sum Sq Mean Sq F value Pr(>F)
## FixAcid 1 29 29 54.29 0.00000000000019 ***
## VolAcid 1 322 322 599.75 < 0.0000000000000002 ***
## CitricAcid 1 0 0 0.65 0.42
## ResSugar 1 42 42 78.17 < 0.0000000000000002 ***
## Chlorid 1 63 63 116.73 < 0.0000000000000002 ***
## FreeSulf 1 1 1 2.35 0.13
## TotalSulf 1 188 188 349.95 < 0.0000000000000002 ***
## Density 1 339 339 630.09 < 0.0000000000000002 ***
## PH 1 184 184 343.28 < 0.0000000000000002 ***
## Sulphates 1 140 140 261.06 < 0.0000000000000002 ***
## Alcohol 1 138 138 256.38 < 0.0000000000000002 ***
## Type 1 22 22 40.54 0.00000000020631 ***
## Residuals 6484 3485 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(LinReg09)
## FixAcid VolAcid CitricAcid ResSugar Chlorid FreeSulf TotalSulf
## 5.048 2.168 1.622 9.635 1.659 2.236 4.046
## Density PH Sulphates Alcohol Type
## 22.337 2.564 1.556 5.617 7.224
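A common screen flags predictors whose VIF exceeds 5 (or 10, depending on the convention); as a quick sketch, the predictors driving the multicollinearity can be listed directly.
# List predictors above the usual VIF rule-of-thumb threshold of 5
VIFs <- vif(LinReg09)
VIFs[VIFs > 5]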
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(as.character(Wines$Quality))
# Run Linear Regression with all Xs
LinReg10 <- lm(QualityNum ~ FixAcid + VolAcid + CitricAcid + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol + Type, data = Wines)
summary(LinReg10)
##
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + CitricAcid + Chlorid +
## FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol +
## Type, data = Wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.010 -0.467 -0.038 0.467 3.154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.502639 5.715140 -5.86 0.0000000047955380 ***
## FixAcid -0.031785 0.011282 -2.82 0.0049 **
## VolAcid -1.560064 0.081779 -19.08 < 0.0000000000000002 ***
## CitricAcid -0.085521 0.080362 -1.06 0.2873
## Chlorid -1.209213 0.334468 -3.62 0.0003 ***
## FreeSulf 0.006134 0.000764 8.03 0.0000000000000012 ***
## TotalSulf -0.001679 0.000325 -5.16 0.0000002539355998 ***
## Density 34.590746 5.726875 6.04 0.0000000016257380 ***
## PH -0.080357 0.072546 -1.11 0.2680
## Sulphates 0.534056 0.074751 7.14 0.0000000000010023 ***
## Alcohol 0.361938 0.012411 29.16 < 0.0000000000000002 ***
## TypeWhite -0.062943 0.049574 -1.27 0.2042
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.739 on 6485 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.283
## F-statistic: 234 on 11 and 6485 DF, p-value: <0.0000000000000002
anova(LinReg10)
## Analysis of Variance Table
##
## Response: QualityNum
## Df Sum Sq Mean Sq F value Pr(>F)
## FixAcid 1 29 29 53.38 0.0000000000003076 ***
## VolAcid 1 322 322 589.77 < 0.0000000000000002 ***
## CitricAcid 1 0 0 0.64 0.42
## Chlorid 1 56 56 103.36 < 0.0000000000000002 ***
## FreeSulf 1 12 12 21.96 0.0000028340998068 ***
## TotalSulf 1 223 223 408.45 < 0.0000000000000002 ***
## Density 1 182 182 333.84 < 0.0000000000000002 ***
## PH 1 36 36 64.96 0.0000000000000009 ***
## Sulphates 1 80 80 147.05 < 0.0000000000000002 ***
## Alcohol 1 467 467 853.85 < 0.0000000000000002 ***
## Type 1 1 1 1.61 0.20
## Residuals 6485 3544 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(LinReg10)
## FixAcid VolAcid CitricAcid Chlorid FreeSulf TotalSulf Density
## 2.543 2.155 1.621 1.632 2.186 4.019 3.505
## PH Sulphates Alcohol Type
## 1.617 1.471 2.604 5.421
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(as.character(Wines$Quality))
# Run Linear Regression with all Xs
LinReg11 <- lm(QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + TotalSulf + Density + PH + Sulphates + Alcohol, data = Wines)
summary(LinReg11)
##
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf +
## TotalSulf + Density + PH + Sulphates + Alcohol, data = Wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.974 -0.466 -0.038 0.463 3.135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.879569 5.523150 -6.32 0.000000000287744 ***
## FixAcid -0.031637 0.010248 -3.09 0.00203 **
## VolAcid -1.481112 0.067980 -21.79 < 0.0000000000000002 ***
## Chlorid -1.173701 0.322055 -3.64 0.00027 ***
## FreeSulf 0.006314 0.000754 8.37 < 0.0000000000000002 ***
## TotalSulf -0.001967 0.000265 -7.42 0.000000000000137 ***
## Density 35.787370 5.567088 6.43 0.000000000138138 ***
## PH -0.044325 0.068967 -0.64 0.52044
## Sulphates 0.554740 0.072280 7.67 0.000000000000019 ***
## Alcohol 0.360322 0.012273 29.36 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.739 on 6487 degrees of freedom
## Multiple R-squared: 0.284, Adjusted R-squared: 0.283
## F-statistic: 286 on 9 and 6487 DF, p-value: <0.0000000000000002
anova(LinReg11)
## Analysis of Variance Table
##
## Response: QualityNum
## Df Sum Sq Mean Sq F value Pr(>F)
## FixAcid 1 29 29 53.4 0.0000000000003088 ***
## VolAcid 1 322 322 589.7 < 0.0000000000000002 ***
## Chlorid 1 57 57 103.8 < 0.0000000000000002 ***
## FreeSulf 1 12 12 21.2 0.0000042706747999 ***
## TotalSulf 1 213 213 389.7 < 0.0000000000000002 ***
## Density 1 187 187 342.8 < 0.0000000000000002 ***
## PH 1 33 33 60.8 0.0000000000000073 ***
## Sulphates 1 83 83 152.2 < 0.0000000000000002 ***
## Alcohol 1 471 471 861.9 < 0.0000000000000002 ***
## Residuals 6487 3546 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# Making a Numeric Quality Column
Wines$QualityNum <- as.numeric(as.character(Wines$Quality))
# Run Linear Regression with all Xs
LinReg12 <- lm(QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf + TotalSulf + Density + Sulphates + Alcohol, data = Wines)
summary(LinReg12)
##
## Call:
## lm(formula = QualityNum ~ FixAcid + VolAcid + Chlorid + FreeSulf +
## TotalSulf + Density + Sulphates + Alcohol, data = Wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.992 -0.466 -0.039 0.463 3.131
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.121460 5.395477 -6.32 0.000000000271688 ***
## FixAcid -0.028545 0.009049 -3.15 0.00162 **
## VolAcid -1.488211 0.067074 -22.19 < 0.0000000000000002 ***
## Chlorid -1.159040 0.321232 -3.61 0.00031 ***
## FreeSulf 0.006310 0.000754 8.37 < 0.0000000000000002 ***
## TotalSulf -0.001932 0.000260 -7.44 0.000000000000111 ***
## Density 34.877025 5.383638 6.48 0.000000000099540 ***
## Sulphates 0.546221 0.071051 7.69 0.000000000000017 ***
## Alcohol 0.358852 0.012058 29.76 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.739 on 6488 degrees of freedom
## Multiple R-squared: 0.284, Adjusted R-squared: 0.283
## F-statistic: 322 on 8 and 6488 DF, p-value: <0.0000000000000002
anova(LinReg12)
## Analysis of Variance Table
##
## Response: QualityNum
## Df Sum Sq Mean Sq F value Pr(>F)
## FixAcid 1 29 29 53.4 0.00000000000031 ***
## VolAcid 1 322 322 589.7 < 0.0000000000000002 ***
## Chlorid 1 57 57 103.8 < 0.0000000000000002 ***
## FreeSulf 1 12 12 21.2 0.00000426640941 ***
## TotalSulf 1 213 213 389.8 < 0.0000000000000002 ***
## Density 1 187 187 342.8 < 0.0000000000000002 ***
## Sulphates 1 103 103 188.9 < 0.0000000000000002 ***
## Alcohol 1 484 484 885.7 < 0.0000000000000002 ***
## Residuals 6488 3546 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
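Because LinReg12 is LinReg11 without PH and both are fitted to the same observations, a partial F-test can confirm that dropping PH does not significantly worsen the fit; a minimal sketch.
# Partial F-test: reduced model (LinReg12) against the model including PH (LinReg11)
anova(LinReg12, LinReg11)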
# Get the ANOVA table
anova_table <- anova(LinReg12)
# Extract the sum of squares
SumSq <- anova_table$`Sum Sq`
# Prepare data for the pie chart
SSdf <- data.frame(
Component = rownames(anova_table),
SumSq = SumSq
)
# Sort by SumSq
SSdf <- SSdf[order(SSdf$SumSq, decreasing = TRUE),]
# Add Percentage Column (as a proportion first)
SSdf$Percent <- SSdf$SumSq / sum(SSdf$SumSq)
# Keep the cumulative sum of squares in a separate vector for annotating the Pareto chart later
CumSum <- cumsum(SSdf$SumSq)
# Format the Percentage Column for display
SSdf$Percent <- paste(round(SSdf$Percent * 100, 2), "%")
# Add Label Column
Labels <- as.data.frame(paste(SSdf$Component, round(SSdf$SumSq, 0), SSdf$Percent, sep = "\n"))
names(Labels)[1] <- "Labels"
SSdf <- bind_cols(SSdf, Labels)
SSdf
## Component SumSq Percent Labels
## 1 Residuals 3546.12 71.59 % Residuals\n3546\n71.59 %
## 2 Alcohol 484.11 9.77 % Alcohol\n484\n9.77 %
## 3 VolAcid 322.33 6.51 % VolAcid\n322\n6.51 %
## 4 TotalSulf 213.04 4.3 % TotalSulf\n213\n4.3 %
## 5 Density 187.36 3.78 % Density\n187\n3.78 %
## 6 Sulphates 103.23 2.08 % Sulphates\n103\n2.08 %
## 7 Chlorid 56.74 1.15 % Chlorid\n57\n1.15 %
## 8 FixAcid 29.17 0.59 % FixAcid\n29\n0.59 %
## 9 FreeSulf 11.57 0.23 % FreeSulf\n12\n0.23 %
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )
# Create the pie chart
ggplot(SSdf, aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
geom_bar(stat = "identity", width = 1) +
geom_label(aes(label = Labels, fill_alpha("white", 0)),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y", start = 30 * (pi / 180), direction = -1) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Wine Data from Portugal",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Load Library if not available
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )
# Create the pie chart
ggplot(SSdf[-c(1),], aes(x = "", y = SumSq, fill = reorder(Component, -SumSq), Total, .desc = TRUE)) +
geom_bar(stat = "identity", width = 1) +
geom_label(aes(label = Labels, fill_alpha("white", 0)),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y", start = 60 * (pi / 180), direction = -1) +
scale_fill_manual(values = Colours) +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
legend.position = 'right',
legend.title = element_text(size = 16),
legend.text = element_text(size = 13),
axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(title = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Wine Data from Portugal",
caption = "Data source: Public Domain",
fill = "Components",
x = "",
y = "")
# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)
# Prepare Table
Pareto <- (SSdf$SumSq)
names(Pareto) <- SSdf$Component
# Coordinates for annotating the cumulative line of the Pareto chart
line_x <- seq_along(SSdf$Component) + 0.5 # Adjust x-position if needed
line_y <- CumSum # Cumulative sum of squares, on the left-axis scale used by pareto.chart
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )
# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
ylab = "Sum of Squares", # label y left
# colors of the chart
col = Colours,
# ranges of the percentages at the right
cumperc = seq(0, 100, by = 20),
# label y right
ylab2 = "Cumulative Percentage",
# title of the chart
main = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
# font size
cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0,
)
##
## Pareto chart analysis for Pareto
## Frequency Cum.Freq. Percentage Cum.Percent.
## Residuals 3546.1198 3546.1198 71.5855 71.5855
## Alcohol 484.1102 4030.2301 9.7727 81.3582
## VolAcid 322.3276 4352.5577 6.5068 87.8650
## TotalSulf 213.0370 4565.5947 4.3006 92.1656
## Density 187.3637 4752.9583 3.7823 95.9479
## Sulphates 103.2340 4856.1923 2.0840 98.0319
## Chlorid 56.7437 4912.9360 1.1455 99.1774
## FixAcid 29.1748 4942.1109 0.5890 99.7663
## FreeSulf 11.5748 4953.6857 0.2337 100.0000
# Mark the points on the cumulative line drawn by pareto.chart
points(line_x, line_y, pch = 20, col = "red")
# Add cumulative-percentage labels above the line
text(line_x, line_y, labels = round(100 * CumSum / sum(SSdf$SumSq), 1), pos = 3, col = "red")
# Load Library if not available
if(! "qcc" %in% installed.packages()) { install.packages("qcc", dependencies = TRUE) }
library(qcc)
# Prepare Table
Pareto <- SSdf[-c(1),]$SumSq
names(Pareto) <- SSdf[-c(1),]$Component
# Coordinates for annotating the cumulative line (Residuals row excluded, matching the chart below)
line_x <- seq_along(SSdf[-c(1),]$Component) + 0.5 # Adjust x-position if needed
line_y <- cumsum(SSdf[-c(1),]$SumSq)
# Define colours
Colours <- c("#FFB3BA", "#FFFFBA", "#BAFFC9", "#FFDFBA", "#BAE1FF", "#cfcfcf", "#CBAEFF", "#F010BA", "#65AABA" )
# Create the Pareto chart
pareto.chart(Pareto, xlab = " ", # x-axis label
ylab = "Sum of Squares", # label y left
# colors of the chart
col = Colours,
# ranges of the percentages at the right
cumperc = seq(0, 100, by = 20),
# label y right
ylab2 = "Cumulative Percentage",
# title of the chart
main = "Sum of Squares from Regression Model Summary",
subtitle = "Based on Cereal Data",
# font size
cex.names = 1.4, cex.axis = 1.2, cex.lab = 1.3, cex.main = 2.0,
)
##
## Pareto chart analysis for Pareto
## Frequency Cum.Freq. Percentage Cum.Percent.
## Alcohol 484.1102 484.1102 34.3934 34.3934
## VolAcid 322.3276 806.4379 22.8996 57.2931
## TotalSulf 213.0370 1019.4748 15.1351 72.4282
## Density 187.3637 1206.8385 13.3112 85.7394
## Sulphates 103.2340 1310.0725 7.3342 93.0736
## Chlorid 56.7437 1366.8162 4.0313 97.1050
## FixAcid 29.1748 1395.9910 2.0727 99.1777
## FreeSulf 11.5748 1407.5659 0.8223 100.0000
# Calculate the frequency and cumulative percentage for each Type
SSdf$SumSq <- round(SSdf$SumSq, digits = 0)
SSdf$Percent <- as.numeric(gsub(" %", "", SSdf$Percent))
SSdf$CumPerc <- cumsum(SSdf$Percent)
# Scale the cumulative percentage values to match the height of the first bar
SSdf$CumPercScaled <- SSdf$CumPerc * sum(SSdf$SumSq) / 100 # Scale CumPerc to the same range as SumSq
# Define y-axis breaks for alignment
primary_breaks <- seq(0, max(SSdf$CumPercScaled) + 200, by = 500) # Grid for primary axis
secondary_breaks <- seq(0, 105, by = 10) # Grid for secondary axis
# Plot the Pareto chart
Plot <- ggplot(SSdf, aes(x = reorder(Component, -SumSq), y = SumSq)) +
geom_bar(stat = "identity", fill = "steelblue") + # Bars for frequency
geom_text(aes(label = SumSq), vjust = -2.0, size = 4) + # Labels for bars
geom_line(aes(y = CumPerc * sum(SumSq) / 100, group = 1), # Line for cumulative %
color = "red", size = 1) +
geom_point(aes(y = CumPerc * sum(SumSq) / 100), color = "red") + # Points on line
# Labels for cumulative percentage (Corrected Scaling)
geom_text(aes(
y = CumPercScaled,
label = sprintf("%.1f%%", CumPerc)
), color = "red", vjust = -0.5, size = 4) +
# **Corrected Secondary Axis Scaling**
scale_y_continuous(
breaks = primary_breaks, # Primary y-axis grid
sec.axis = sec_axis(
~ . * 100 / sum(SSdf$SumSq), # Correct transformation for secondary axis
name = "Cumulative Percentage",
breaks = secondary_breaks # Secondary y-axis grid
)
) +
labs(title = "Pareto Chart of Sum of Squares - Importance of Factors",
x = NULL,
y = "Frequency") +
theme_minimal() + # Ensure gridlines are clearly visible
theme(
axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5),
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
panel.grid.major.y = element_line(color = "gray", size = 0.5), # Adjust grid style
panel.grid.minor.y = element_blank() # Remove minor grid lines
)
# Print the plot
print(Plot)
# Save Predictions in WinesTrain
Wines$PredCart <- predict(WineTree, Wines)
Wines$PredCartW <- predict(WineTreeW, Wines)
Wines$PredLin <- predict(LinReg12, Wines)
RMSEcart <- sqrt(mean((as.numeric(as.character(Wines$PredCart)) - as.numeric(as.character(Wines$Quality))) ^2))
RMSEcartW <- sqrt(mean((as.numeric(as.character(Wines$PredCartW)) - as.numeric(as.character(Wines$Quality))) ^2))
RMSElin <- sqrt(mean((as.numeric(as.character(Wines$PredLin)) - as.numeric(as.character(Wines$Quality))) ^2))
cat("\n\nRMSE of Wine Model with CART: ", RMSEcart)
##
##
## RMSE of Wine Model with CART: 0.5859
cat("\n\nRMSE of Wine Model with CART after Winnowing/Boosting: ", RMSEcartW)
##
##
## RMSE of Wine Model with CART after Winnowing/Boosting: 0.3794
cat("\n\nRMSE of Wine Model with Linear Regression: ", RMSElin)
##
##
## RMSE of Wine Model with Linear Regression: 2.132
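Finally, the three RMSE values can be collected in a small table for easier comparison.
# Summarise the model RMSEs in one data frame
RMSEsummary <- data.frame(Model = c("CART", "CART (Winnowing/Boosting)", "Linear Regression"),
RMSE = c(RMSEcart, RMSEcartW, RMSElin))
RMSEsummary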