# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
## Loading other packages if not available
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "zoo" %in% installed.packages()) { install.packages("zoo", dependencies = TRUE) }
library(zoo)
if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
library(car)
# Setting working Directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
# Set number of decimals
options(digits = 4) # Modify global options
# Download Data from URL
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
Cereals$ID <- 1:nrow(Cereals)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Cereals)
## [1] "X" "name" "mfr" "type" "calories" "protein"
## [7] "fat" "sodium" "fiber" "carbo" "sugars" "potass"
## [13] "vitamins" "shelf" "weight" "cups" "rating" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Cereals)
## 'data.frame': 76 obs. of 18 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ name : Factor w/ 76 levels "100% Bran","100% Natural Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ mfr : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
## $ type : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 -1 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Cereals, 5)
## X name mfr type calories protein fat sodium fiber carbo
## 1 1 100% Bran N C 70 4 1 130 10 5
## 2 2 100% Natural Bran Q C 120 3 5 15 2 8
## 3 3 All-Bran K C 70 4 1 260 9 7
## 4 4 All-Bran with Extra Fiber K C 50 4 0 140 14 8
## 5 5 Almond Delight R C 110 2 2 200 1 14
## sugars potass vitamins shelf weight cups rating ID
## 1 6 280 25 3 1 0.33 68.40 1
## 2 8 135 0 3 1 1.00 33.98 2
## 3 5 320 25 3 1 0.33 59.43 3
## 4 0 330 25 3 1 0.50 93.70 4
## 5 8 -1 25 3 1 0.75 34.38 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Cereals, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25
## 1 X 76 38.75 22.412 1 19.75
## 2 name 76
## 3 ... 100% Bran 1 1.316%
## 4 ... 100% Natural Bran 1 1.316%
## 5 ... All-Bran 1 1.316%
## 6 ... All-Bran with Extra Fiber 1 1.316%
## 7 ... Almond Delight 1 1.316%
## 8 ... Apple Cinnamon Cheerios 1 1.316%
## 9 ... Apple Jacks 1 1.316%
## 10 ... Basic 4 1 1.316%
## 11 ... Bran Chex 1 1.316%
## 12 ... Bran Flakes 1 1.316%
## 13 ... Cap'n'Crunch 1 1.316%
## 14 ... Cheerios 1 1.316%
## 15 ... Cinnamon Toast Crunch 1 1.316%
## 16 ... Clusters 1 1.316%
## 17 ... Cocoa Puffs 1 1.316%
## 18 ... Corn Chex 1 1.316%
## 19 ... Corn Flakes 1 1.316%
## 20 ... Corn Pops 1 1.316%
## 21 ... Count Chocula 1 1.316%
## 22 ... Cracklin' Oat Bran 1 1.316%
## 23 ... Cream of Wheat (Quick) 1 1.316%
## 24 ... Crispix 1 1.316%
## 25 ... Crispy Wheat & Raisins 1 1.316%
## 26 ... Double Chex 1 1.316%
## 27 ... Froot Loops 1 1.316%
## 28 ... Frosted Flakes 1 1.316%
## 29 ... Frosted Mini-Wheats 1 1.316%
## 30 ... Fruit & Fibre Dates; Walnuts; and Oats 1 1.316%
## 31 ... Fruitful Bran 1 1.316%
## 32 ... Fruity Pebbles 1 1.316%
## 33 ... Golden Crisp 1 1.316%
## 34 ... Golden Grahams 1 1.316%
## 35 ... Grape-Nuts 1 1.316%
## 36 ... Grape Nuts Flakes 1 1.316%
## 37 ... Great Grains Pecan 1 1.316%
## 38 ... Honey-comb 1 1.316%
## 39 ... Honey Graham Ohs 1 1.316%
## 40 ... Honey Nut Cheerios 1 1.316%
## 41 ... Just Right Crunchy Nuggets 1 1.316%
## 42 ... Just Right Fruit & Nut 1 1.316%
## 43 ... Kix 1 1.316%
## 44 ... Life 1 1.316%
## 45 ... Lucky Charms 1 1.316%
## 46 ... Maypo 1 1.316%
## 47 ... Muesli Raisins; Dates; & Almonds 1 1.316%
## 48 ... Muesli Raisins; Peaches; & Pecans 1 1.316%
## 49 ... Mueslix Crispy Blend 1 1.316%
## 50 ... Multi-Grain Cheerios 1 1.316%
## 51 ... Nut&Honey Crunch 1 1.316%
## 52 ... Nutri-Grain Almond-Raisin 1 1.316%
## 53 ... Nutri-grain Wheat 1 1.316%
## 54 ... Oatmeal Raisin Crisp 1 1.316%
## 55 ... Post Nat. Raisin Bran 1 1.316%
## 56 ... Product 19 1 1.316%
## 57 ... Puffed Rice 1 1.316%
## 58 ... Puffed Wheat 1 1.316%
## 59 ... Quaker Oat Squares 1 1.316%
## 60 ... Raisin Bran 1 1.316%
## 61 ... Raisin Nut Bran 1 1.316%
## 62 ... Raisin Squares 1 1.316%
## 63 ... Rice Chex 1 1.316%
## 64 ... Rice Krispies 1 1.316%
## 65 ... Shredded Wheat 1 1.316%
## 66 ... Shredded Wheat 'n'Bran 1 1.316%
## 67 ... Shredded Wheat spoon size 1 1.316%
## 68 ... Smacks 1 1.316%
## 69 ... Special K 1 1.316%
## 70 ... Strawberry Fruit Wheats 1 1.316%
## 71 ... Total Corn Flakes 1 1.316%
## 72 ... Total Raisin Bran 1 1.316%
## 73 ... Total Whole Grain 1 1.316%
## 74 ... Triples 1 1.316%
## 75 ... Trix 1 1.316%
## 76 ... Wheat Chex 1 1.316%
## 77 ... Wheaties 1 1.316%
## 78 ... Wheaties Honey Gold 1 1.316%
## 79 mfr 76
## 80 ... A 1 1.316%
## 81 ... G 22 28.947%
## 82 ... K 23 30.263%
## 83 ... N 6 7.895%
## 84 ... P 9 11.842%
## 85 ... Q 7 9.211%
## 86 ... R 8 10.526%
## 87 type 76
## 88 ... C 74 97.368%
## 89 ... H 2 2.632%
## 90 calories 76 106.97 19.597 50 100
## 91 protein 76 2.5132 1.0645 1 2
## 92 fat 76 1 1.0066 0 0
## 93 sodium 76 161.78 82.323 0 133.75
## 94 fiber 76 2.1447 2.3984 0 0.75
## 95 carbo 76 14.803 3.9073 5 12
## 96 sugars 76 7.0263 4.3787 0 3
## 97 potass 76 95.895 71.742 -1 40
## 98 vitamins 76 28.618 22.25 0 25
## 99 shelf 76 2.2237 0.82622 1 1.75
## 100 weight 76 1.03 0.15144 0.5 1
## 101 cups 76 0.82303 0.2336 0.25 0.67
## 102 rating 76 42.558 14.109 18.043 32.932
## 103 ID 76 38.5 22.083 1 19.75
## P50 P75 Max NA Mode
## 1 38.5 57.5 77 0
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
## 51
## 52
## 53
## 54
## 55
## 56
## 57
## 58
## 59
## 60
## 61
## 62
## 63
## 64
## 65
## 66
## 67
## 68
## 69
## 70
## 71
## 72
## 73
## 74
## 75
## 76
## 77
## 78
## 79
## 80
## 81
## 82
## 83
## 84
## 85
## 86
## 87
## 88
## 89
## 90 110 110 160 0
## 91 2.5 3 6 0
## 92 1 1.25 5 0
## 93 180 212.5 320 0
## 94 1.75 3 14 0
## 95 14.5 17 23 0
## 96 7 11 15 0
## 97 90 120 330 0
## 98 25 25 100 0
## 99 2 3 3 0
## 100 1 1 1.5 0
## 101 0.75 1 1.5 0
## 102 40.253 50.972 93.705 0
## 103 38.5 57.25 76 0
# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
# Scale data for neural network
scaled <- as.data.frame(mutate(Cereals, across(c(5:13,15:17), MinMax.Fun)))
# Random sampling
samplesize = 0.60 * nrow(scaled)
set.seed(80)
index = sample( seq_len ( nrow ( scaled ) ), size = samplesize )
# creating training and test set
CerealsTrain = scaled[index , ]
CerealsTest = scaled[-index , ]
# Keep only Numeric Variables
CerealsNum <- as.data.frame(scaled[, -c(1:3, 13:15)])
head(CerealsTrain)
## X name mfr type calories protein fat sodium fiber
## 11 11 Cap'n'Crunch Q C 0.6364 0.0 0.4 0.6875 0.00000
## 43 43 Lucky Charms G C 0.5455 0.2 0.2 0.5625 0.00000
## 31 31 Golden Crisp P C 0.4545 0.2 0.0 0.1406 0.00000
## 65 66 Shredded Wheat spoon size N C 0.3636 0.4 0.0 0.0000 0.21429
## 50 50 Nutri-Grain Almond-Raisin K C 0.8182 0.4 0.4 0.6875 0.21429
## 36 36 Honey Graham Ohs Q C 0.6364 0.0 0.4 0.6875 0.07143
## carbo sugars potass vitamins shelf weight cups rating ID
## 11 0.3889 0.8000 0.1088 0.25 2 0.50 0.400 0.0000 11
## 43 0.3889 0.8000 0.1692 0.25 2 0.50 0.600 0.1149 43
## 31 0.3333 1.0000 0.1239 0.25 1 0.50 0.504 0.2275 31
## 65 0.8333 0.0000 0.3656 0.00 1 0.50 0.336 0.7237 65
## 50 0.8889 0.4667 0.3958 0.25 3 0.83 0.336 0.2994 50
## 36 0.3889 0.7333 0.1390 0.25 2 0.50 0.600 0.0506 36
# Write data to working directory
write.csv(CerealsTrain, file = "CerealsTrain.csv")
write.csv(CerealsTest, file = "CerealsTest.csv")
# Load Library if not available
if(! "neuralnet" %in% installed.packages()) { install.packages("neuralnet", dependencies = TRUE) }
library(neuralnet)
if(! "grid" %in% installed.packages()) { install.packages("grid", dependencies = TRUE) }
library(grid)
# fit neural network
set.seed(2)
NN = neuralnet(rating ~ calories + protein + fat + sodium + fiber + sugars, CerealsTrain, hidden = 3 , linear.output = T )
# plot neural network
plot(NN)
# Add custom title with grid.text
grid.text("Artificial Neural Network for Cereals Data",
x = 0.5, y = 0.99, just = "center", gp = gpar(fontsize = 20))
# Plot Garson's importance of factors
Gar.Fun('y', NN)
# Use Test Dataset to Predict Rating
CerealsTest$predictRating = predict(NN, CerealsTest)
# Load Library if not available
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)
# Show Scatter Plot of Predicted Versus Real Rating
plot(CerealsTest$rating, CerealsTest$predictRating, col='blue', pch = 20,
ylab = "Predicted Rating Using NN",
xlab = "Real Rating")
text(CerealsTest$rating, CerealsTest$predictRating, labels = CerealsTest$name, pos = 2)
abline(0,1, col = "red")
# Calculate Root Mean Square Error (RMSE)
RMSE.NN = (sum((CerealsTest$rating - CerealsTest$predictRating)^2) / nrow(CerealsTest)) ^ 0.5
cat("\n\nRMSE: ", RMSE.NN)
##
##
## RMSE: 0.02679
# Download Data from URL
Adults <- read.csv("https://www.coe-data.com/Data/SIM/Adults.csv", stringsAsFactors=T)
# Collapse Dataset so that we will work with a small sample of data
Adults <- Adults[1:800,]
# Collapse Categories
levels(Adults$marital.status)[2:4] <- "Married"
levels(Adults$workclass)[c(2,3,8)] <- "Gov"
levels(Adults$workclass)[c(5, 6)] <- "Self"
# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)
# Prepare Columns
names(Adults) <- str_to_title(names(Adults))
names(Adults) <- substr(names(Adults), 1, 6)
colnames(Adults)[2] <- "Class"
colnames(Adults)[5] <- "EduNum"
colnames(Adults)[6] <- "Marital"
colnames(Adults)[7] <- "Occupat"
colnames(Adults)[8] <- "Relationship"
colnames(Adults)[11] <- "CapGain"
colnames(Adults)[12] <- "CapLoss"
colnames(Adults)[13] <- "WeekHours"
head(Adults,4)
## Age Class Fnlwgt Educat EduNum Marital Occupat
## 1 25 Private 226802 11th 7 Never-married Machine-op-inspct
## 2 38 Private 89814 HS-grad 9 Married Farming-fishing
## 3 28 Gov 336951 Assoc-acdm 12 Married Protective-serv
## 4 44 Private 160323 Some-college 10 Married Machine-op-inspct
## Relationship Race Gender CapGain CapLoss WeekHours Native Income
## 1 Own-child Black Male 0 0 40 United-States <=50K
## 2 Husband White Male 0 0 50 United-States <=50K
## 3 Husband White Male 0 0 40 United-States >50K
## 4 Husband Black Male 7688 0 40 United-States >50K
names(Adults)
## [1] "Age" "Class" "Fnlwgt" "Educat" "EduNum"
## [6] "Marital" "Occupat" "Relationship" "Race" "Gender"
## [11] "CapGain" "CapLoss" "WeekHours" "Native" "Income"
# Determine how many Indicator variables are needed
unique(Adults$Income) # One variable for income
## [1] <=50K >50K
## Levels: <=50K >50K
unique(Adults$Gender) # One variable for sex
## [1] Male Female
## Levels: Female Male
unique(Adults$Race) # Four variables for race
## [1] Black White Asian-Pac-Islander Other
## [5] Amer-Indian-Eskimo
## Levels: Amer-Indian-Eskimo Asian-Pac-Islander Black Other White
unique(Adults$Class) # Three variables for workclass
## [1] Private Gov ? Self
## Levels: ? Gov Never-worked Private Self Without-pay
unique(Adults$Marital) # Four variables for marital.status
## [1] Never-married Married Widowed Divorced Separated
## Levels: Divorced Married Never-married Separated Widowed
# Create indicator variables
Adults$Race_white <- Adults$Race_black <- Adults$Race_as.pac.is <- Adults$Race_am.in.esk <- Adults$Class_gov <- Adults$Class_self <- Adults$Class_priv <- Adults$Ms_marr <- Adults$Ms_div <- Adults$Ms_sep <- Adults$Ms_wid <-
Adults$Income_g50K <- Adults$GenderMale <- c(rep(0, length(Adults$Income)))
for (i in 1:length(Adults$Income)) {
if(Adults$Income[i]==">50K") Adults$Income_g50K[i] <- 1
if(Adults$Gender[i] == "Male") Adults$GenderMale[i] <- 1
if(Adults$Race[i] == "White") Adults$Race_white[i] <- 1
if(Adults$Race[i] == "Amer-Indian-Eskimo") Adults$Race_am.in.esk[i] <- 1
if(Adults$Race[i] == "Asian-Pac-Islander") Adults$Race_as.pac.is[i] <- 1
if(Adults$Race[i] == "Black") Adults$Race_black[i] <- 1
if(Adults$Class[i] == "Gov") Adults$Class_gov[i] <- 1
if(Adults$Class[i] == "Self") Adults$Class_self[i] <- 1
if(Adults$Class[i] == "Private" ) Adults$Class_priv[i] <- 1
if(Adults$Marital[i] == "Married") Adults$Ms_marr[i] <- 1
if(Adults$Marital[i] == "Divorced" ) Adults$Ms_div[i] <- 1
if(Adults$Marital[i] == "Separated" ) Adults$Ms_sep[i] <- 1
if(Adults$Marital[i] == "Widowed" ) Adults$Ms_wid[i] <- 1
}
# Minimax transform the continuous variables
Adults$Age_mm <- (Adults$Age - min(Adults$Age)) / (max(Adults$Age)-min(Adults$Age))
Adults$EduNum_mm <- (Adults$EduNum - min(Adults$EduNum)) / (max(Adults$EduNum)-min(Adults$EduNum))
Adults$CapGain_mm <- (Adults$CapGain - min(Adults$CapGain)) / (max(Adults$CapGain)- min(Adults$CapGain))
Adults$CapLoss_mm <- (Adults$CapLoss - min(Adults$CapLoss)) / (max(Adults$CapLoss)- min(Adults$CapLoss))
Adults$WeekHours_mm <- (Adults$WeekHours - min(Adults$WeekHours)) / (max(Adults$WeekHours)-min(Adults$WeekHours))
# Random sampling
samplesize = 0.60 * nrow(Adults)
set.seed(80)
index = sample( seq_len ( nrow ( Adults ) ), size = samplesize )
# Create training and test set and get rid of the variables we no longer need
AdultsTrain = Adults[ index, -c(1:16)]
AdultsTest = Adults[ -index, -c(1:16)]
# Write data to working directory
write.csv(AdultsTrain, file = "AdultsTrain.csv")
write.csv(AdultsTest, file = "AdultsTest.csv")
# Run the neural net
library(nnet) # Requires package nnet
net.dat <- nnet(Income_g50K ~ Age_mm + EduNum_mm + CapLoss_mm + CapGain_mm + WeekHours_mm, data = AdultsTrain, size = 8)
## # weights: 57
## initial value 176.280853
## final value 106.000000
## converged
table(round(net.dat$fitted.values, 1)) # If fitted values are all the same, rerun nnet
##
## 0
## 480
net.dat$wts # Weights
## [1] 7.463078 2.379653 4.017865 0.195727 -0.007822 1.938247
## [7] 9.429044 2.279962 4.948651 0.384534 -0.196695 4.193843
## [13] -9.248702 -2.201697 -5.798401 0.084257 -0.603092 -3.389351
## [19] -12.119954 -3.052037 -6.870991 0.052914 0.133593 -4.163492
## [25] -4.079619 -1.886951 -2.275251 -0.676292 -0.399636 -1.777567
## [31] -2.553090 -1.177578 -1.573706 -0.652283 -0.510835 -0.018080
## [37] -5.478672 -1.069325 -3.165955 0.190001 0.424434 -1.502311
## [43] -9.496203 -2.565723 -5.950743 0.457548 0.395052 -2.902762
## [49] -95.023291 -49.340489 -26.436925 -39.317858 -27.521989 -63.732206
## [55] -28.639980 -55.612601 -66.421641
# hist(net.dat$wts)
# Use Test Dataset to Predict Rating
predict_testNN = compute(NN3, AdultsTest[,c(13:17)]) # Columns used for model
AdultsTest$predict_testNN = (predict_testNN$net.result * (max(AdultsTest$Income_g50K) - min(AdultsTest$Income_g50K))) + min(AdultsTest$Income_g50K)
# Rounding values to make it easier to inspect
AdultsTest$predict_testNN <- round(AdultsTest$predict_testNN, 5)
# Show Scatter Plot of Predicted Versus Real Rating
plot(AdultsTest$Income_g50K, AdultsTest$predict_testNN, col='blue', pch = 20,
ylab = "Predicted Rating Using NN",
xlab = "Real Rating")
text(AdultsTest$Income_g50K, AdultsTest$predict_testNN, labels = AdultsTest$name, pos = 2)
# Load Library if not available
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)
abline(0,1, col = "red")
# Calculate Root Mean Square Error (RMSE)
RMSE.NN = (sum((AdultsTest$Income_g50K - AdultsTest$predict_testNN)^2) / nrow(AdultsTest)) ^ 0.5
RMSE.NN
## [1] 0.4285