1 Load Libraries

# Loading necessary packages.
# Each package is installed on first use, then attached.
# FIX(review): package presence is now checked against
# rownames(installed.packages()) — the documented package-name index — rather
# than the whole metadata matrix, where a match in any other column (version,
# license, ...) could coincidentally succeed.

## Markdown Update
if(! "rmarkdown" %in% rownames(installed.packages())) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

## Loading other packages if not available
if(! "readxl" %in% rownames(installed.packages())) { install.packages("readxl", dependencies = TRUE) }
library(readxl)

if(! "tidyr" %in% rownames(installed.packages())) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)

if(! "zoo" %in% rownames(installed.packages())) { install.packages("zoo", dependencies = TRUE) }
library(zoo)

if(! "car" %in% rownames(installed.packages())) { install.packages("car", dependencies = TRUE) }
library(car)

# Global Settings
options(digits = 4)   # print numeric output with 4 significant digits
options(scipen = 999) # strongly prefer fixed over scientific notation

# Setting working Directory
# NOTE(review): machine-specific absolute path — this breaks on any other
# machine. Consider a project-relative path instead of setwd().
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Make date string: today's date in Singapore time, formatted yymmdd
# (e.g. "240131"), for use in file names.
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

2 Classification and Regression Tree Using CART

2.1 Get Data to Classify Whether or not a Car Insurance Claim was Fraudulent

# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)
if(! "rpart.plot" %in% installed.packages()) { install.packages("rpart.plot", dependencies = TRUE) }
library(rpart.plot)

# Input Data Set
# Toy training set of three insurance claims: RearEnd (was it a rear-end
# collision?) coincides exactly with Fraud in these rows, so the tree below
# can separate the classes perfectly.
train <- data.frame(
  ClaimID = c(1,2,3),
  RearEnd = c(TRUE, FALSE, TRUE),
  Fraud   = c(TRUE, FALSE, TRUE)
)

# Show Data
train
##   ClaimID RearEnd Fraud
## 1       1    TRUE  TRUE
## 2       2   FALSE FALSE
## 3       3    TRUE  TRUE
# Write data to working directory
# NOTE(review): write.csv() also writes row names as an extra unnamed first
# column; add row.names = FALSE if that column is unwanted downstream.
write.csv(train, file = "ClaimTrain.csv")

2.2 Calculate Decision Tree

# Calculate the Tree
# CART classification tree: predict Fraud from RearEnd alone.
# minsplit = 2 / minbucket = 1 relax rpart's defaults so a split is allowed
# even on this 3-row data set.
mytree <- rpart(
  Fraud ~ RearEnd, 
  data = train, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(mytree)
## n= 3 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3 1 TRUE (0.3333 0.6667)  
##   2) RearEnd< 0.5 1 0 FALSE (1.0000 0.0000) *
##   3) RearEnd>=0.5 2 0 TRUE (0.0000 1.0000) *
# Plot the Tree
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(mytree, main = "Classification Tree To Identify Insurance Fraud", 
           under = FALSE,  
           type = 2,                 # split labels below the node boxes
           extra = 106,              # class probability + % of observations per node
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,                # display node numbers
           fallen.leaves = TRUE,     # put all leaves on one row at the bottom
           digits = 3,
           box.palette = "RdGn"      # red-to-green diverging node colouring
           )

2.3 Get Data for Predicting Whether To Play Tennis

# Download Data from URL

2.4 Use Predictors Whether To Play Tennis to Develop Decision Tree - Using Recursive Partitioning (RPart)

# Change Play Tennis Column
# Recode the target from "Yes"/"No" to the more readable "Play"/"Don't Play".
# NOTE(review): relies on the PlayTennis data frame from the download chunk
# above, which appears empty in this rendering — confirm the download code is
# present before re-running.
PlayTennis$PlayTennis <- ifelse(PlayTennis$PlayTennis == "Yes", "Play", "Don't Play")

# Show Data
PlayTennis
##    Day  Outlook Temperature Humidity   Wind PlayTennis
## 1    1    Sunny         Hot     High   Weak Don't Play
## 2    2    Sunny         Hot     High Strong Don't Play
## 3    3 Overcast         Hot     High   Weak       Play
## 4    4    Rainy        Mild     High   Weak       Play
## 5    5    Rainy        Cool   Normal   Weak       Play
## 6    6    Rainy        Cool   Normal Strong Don't Play
## 7    7 Overcast        Cool   Normal Strong       Play
## 8    8    Sunny        Mild     High   Weak Don't Play
## 9    9    Sunny        Cool   Normal   Weak       Play
## 10  10    Rainy        Mild   Normal   Weak       Play
## 11  11    Sunny        Mild   Normal Strong       Play
## 12  12 Overcast        Mild     High Strong       Play
## 13  13 Overcast         Hot   Normal   Weak       Play
## 14  14    Rainy        Mild     High Strong Don't Play
# Calculate the Tree
# CART classification tree on the classic 14-row "play tennis" data.
# minsplit = 2 / minbucket = 1 again allow splits down to single observations.
TennisTree <- rpart(
  PlayTennis ~ Outlook + Temperature + Wind + Humidity, 
  data = PlayTennis, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(TennisTree)
## n= 14 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 14 5 Play (0.3571 0.6429)  
##    2) Outlook=Rainy,Sunny 10 5 Don't Play (0.5000 0.5000)  
##      4) Humidity=High 5 1 Don't Play (0.8000 0.2000)  
##        8) Outlook=Sunny 3 0 Don't Play (1.0000 0.0000) *
##        9) Outlook=Rainy 2 1 Don't Play (0.5000 0.5000)  
##         18) Wind=Strong 1 0 Don't Play (1.0000 0.0000) *
##         19) Wind=Weak 1 0 Play (0.0000 1.0000) *
##      5) Humidity=Normal 5 1 Play (0.2000 0.8000)  
##       10) Wind=Strong 2 1 Don't Play (0.5000 0.5000)  
##         20) Outlook=Rainy 1 0 Don't Play (1.0000 0.0000) *
##         21) Outlook=Sunny 1 0 Play (0.0000 1.0000) *
##       11) Wind=Weak 3 0 Play (0.0000 1.0000) *
##    3) Outlook=Overcast 4 0 Play (0.0000 1.0000) *
# Plot the Tree
rpart.plot(TennisTree, 
           shadow.col = "gray", 
           nn = TRUE,
           main = "Classification Tree To Decide to Play Tennis")

2.5 Get Data to Determine Whether Customer Should be Classified as Credit Risk

# Input Data Set
# Eight customers with categorical Savings/Assets levels, income in
# thousands (IncomeK), and the target credit Risk label (Good/Bad).
RiskData <- data.frame(
  Customer = c(1, 2, 3, 4, 5, 6, 7, 8),
  Savings  = c("Medium", "Low", "High", "Medium", "Low", "High", "Low", "Medium"),
  Assets   = c("High", "Low", "Medium", "Medium", "Medium", "High", "Low", "Medium"),
  IncomeK  = c(75, 50, 25, 50, 100, 25, 25, 75),
  Risk     = c("Good", "Bad", "Bad", "Good","Good", "Good", "Bad", "Good")
)

# Display Data Table
RiskData
##   Customer Savings Assets IncomeK Risk
## 1        1  Medium   High      75 Good
## 2        2     Low    Low      50  Bad
## 3        3    High Medium      25  Bad
## 4        4  Medium Medium      50 Good
## 5        5     Low Medium     100 Good
## 6        6    High   High      25 Good
## 7        7     Low    Low      25  Bad
## 8        8  Medium Medium      75 Good
# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)

# Load Library if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Explore Data
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# Summary table via vtable::sumtable. The first summ vector applies to
# numeric columns (count, mean, sd, quartiles, NA share, mode); the second
# applies to factor/character columns (count and percent per level).
sumtable(RiskData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##      Variable N   Mean     SD Min  P25 P50  P75 Max NA Mode
## 1    Customer 8    4.5 2.4495   1 2.75 4.5 6.25   8  0     
## 2     Savings 8                                            
## 3    ... High 2    25%                                     
## 4     ... Low 3  37.5%                                     
## 5  ... Medium 3  37.5%                                     
## 6      Assets 8                                            
## 7    ... High 2    25%                                     
## 8     ... Low 2    25%                                     
## 9  ... Medium 4    50%                                     
## 10    IncomeK 8 53.125  28.15  25   25  50   75 100  0     
## 11       Risk 8                                            
## 12    ... Bad 3  37.5%                                     
## 13   ... Good 5  62.5%

2.6 Use Predictors to Determine Whether Customer Should be Classified as Credit Risk

# Calculate the Tree
# CART classification tree: predict Risk from Savings, Assets and IncomeK.
# minsplit = 2 / minbucket = 1 relax the defaults so this 8-row data set
# can still be split.
RiskTree <- rpart(
  Risk ~ Savings + Assets + IncomeK, 
  data = RiskData, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(RiskTree)
## n= 8 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 8 3 Good (0.3750 0.6250)  
##    2) Assets=Low 2 0 Bad (1.0000 0.0000) *
##    3) Assets=High,Medium 6 1 Good (0.1667 0.8333)  
##      6) Savings=High 2 1 Bad (0.5000 0.5000)  
##       12) Assets=Medium 1 0 Bad (1.0000 0.0000) *
##       13) Assets=High 1 0 Good (0.0000 1.0000) *
##      7) Savings=Low,Medium 4 0 Good (0.0000 1.0000) *
# Plot the Tree
rpart.plot(RiskTree, 
           shadow.col = "gray", 
           nn = TRUE,
           main = "Classification Tree To Identify Credit Risk")

3 Classification and Regression Tree Using Conditional Inference Tree (CTREE, C4.5 Method)

3.1 Calculate Decision Tree Using Partykit

# Load Library if not available
if(! "partykit" %in% installed.packages()) { install.packages("partykit", dependencies = TRUE) }
library(partykit)
# Calculate TennisTree (conditional inference tree).
# This overwrites the rpart TennisTree from section 2.4.
# mincriterion = 0 together with minsplit = 1 / minbucket = 0 disables the
# usual significance-based stopping, so the tree is grown out fully
# (a C4.5-like fully expanded tree).
# NOTE(review): the printed output below shows Yes/No labels, i.e. it was
# apparently produced before the Play/Don't Play recode in section 2.4 —
# verify which version of PlayTennis this chunk should use.
TennisTree <- ctree(PlayTennis ~ Outlook + Humidity + Wind + Temperature, data = PlayTennis, 
                    minbucket = 0, 
                    minsplit = 1,
                    testtype = "Teststatistic",
                    mincriterion = 0, 
                    weights = NULL)

TennisTree
## 
## Model formula:
## PlayTennis ~ Outlook + Humidity + Wind + Temperature
## 
## Fitted party:
## [1] root
## |   [2] Outlook in Overcast: Yes (n = 4, err = 0%)
## |   [3] Outlook in Rainy, Sunny
## |   |   [4] Humidity in High
## |   |   |   [5] Outlook in Rainy
## |   |   |   |   [6] Wind in Strong: No (n = 1, err = 0%)
## |   |   |   |   [7] Wind in Weak: Yes (n = 1, err = 0%)
## |   |   |   [8] Outlook in Sunny: No (n = 3, err = 0%)
## |   |   [9] Humidity in Normal
## |   |   |   [10] Wind in Strong
## |   |   |   |   [11] Temperature in Cool: No (n = 1, err = 0%)
## |   |   |   |   [12] Temperature in Mild: Yes (n = 1, err = 0%)
## |   |   |   [13] Wind in Weak: Yes (n = 3, err = 0%)
## 
## Number of inner nodes:    6
## Number of terminal nodes: 7

3.2 Plot Decision Tree to Play Tennis Using Partykit

# Load Library if not available
if(! "ggparty" %in% installed.packages()) { install.packages("ggparty", dependencies = TRUE) }
library(ggparty)

# Plot the Decision Tree
# Colour setup: three translucent HCL fills for the terminal bar plots.
# labelcol / indexcol are defined but not used in this chunk; kept in case
# later chunks reference them.
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)
indexcol  <- hcl(150, 200, 50, 0.4)

plot(TennisTree, main="Classification Tree: Are the Conditions Right to Play Tennis today?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(TennisTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(TennisTree, col = "black", 
                               # FIX: was columncol[c(2, 1, 4)] — index 4 is past the
                               # end of the length-3 colour vector and yielded NA.
                               # The response has two classes, so two fills suffice.
                               fill = columncol[c(2, 1)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))

3.3 Show Importance of Features

# Load Library if not available
if(! "vip" %in% installed.packages()) { install.packages("vip", dependencies = TRUE) }
library(vip)

# Create a variable importance plot
# Ranks the four predictors by importance in the fitted tree.
# NOTE(review): vip's support for partykit ctree objects depends on the vip
# version — confirm this renders rather than erroring on the model class.
var_importance <- vip::vip(TennisTree, num_features = 4)
print(var_importance)

4 Classification and Regression Tree Using CART

4.1 Get Data for Predicting Whether Income is < 50k

# Download Data from URL

4.2 Prepare Data

# Collapse Categories
# Merge several factor levels into broader groups:
#   - marital.status levels 2-4            -> "Married"
#   - workclass levels 2, 3, 8             -> "Gov"
#   - workclass levels 5, 6 (after merge)  -> "Self"
# NOTE(review): this indexes levels by position, so it assumes the default
# (alphabetical) level order of the freshly loaded Adults data. The download
# chunk above appears empty in this rendering — confirm it before re-running,
# and note that re-running this chunk twice would collapse the wrong levels.
levels(Adults$marital.status)[2:4] <- "Married"
levels(Adults$workclass)[c(2,3,8)] <- "Gov"
levels(Adults$workclass)[c(5, 6)]  <- "Self"


# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

# Prepare Columns
# Title-case all column names, truncate them to 6 characters, then give
# selected columns explicit readable names.
names(Adults) <- str_to_title(names(Adults))
names(Adults) <- substr(names(Adults), 1, 6)

colnames(Adults)[2]  <- "Class"
colnames(Adults)[5]  <- "EduNum"
colnames(Adults)[6]  <- "Marital"
colnames(Adults)[7]  <- "Occupat"
colnames(Adults)[8]  <- "Relationship"
colnames(Adults)[11] <- "CapGain"
colnames(Adults)[12] <- "CapLoss"
colnames(Adults)[13] <- "WeekHours"

head(Adults, 4)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native Income
## 1    Own-child Black   Male       0       0        40 United-States  <=50K
## 2      Husband White   Male       0       0        50 United-States  <=50K
## 3      Husband White   Male       0       0        40 United-States   >50K
## 4      Husband Black   Male    7688       0        40 United-States   >50K
# Write data to working directory
write.csv(Adults, file = "Adults.csv") 

4.3 Explore Data Frame

# Show Characteristics of Data Frame
# Quick exploration: column names, structure, first rows, then a full
# descriptive-statistics table.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Adults)
## 'data.frame':    48842 obs. of  15 variables:
##  $ Age         : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ Class       : Factor w/ 6 levels "?","Gov","Never-worked",..: 4 4 2 4 1 4 1 5 4 4 ...
##  $ Fnlwgt      : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ Educat      : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ EduNum      : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ Marital     : Factor w/ 5 levels "Divorced","Married",..: 3 2 2 2 3 3 3 2 3 2 ...
##  $ Occupat     : Factor w/ 15 levels "?","Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ Relationship: Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ Race        : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ Gender      : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ CapGain     : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ CapLoss     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ WeekHours   : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Native      : Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 40 ...
##  $ Income      : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Adults, 5)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
## 5  18       ? 103497 Some-college     10 Never-married                 ?
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native Income
## 1    Own-child Black   Male       0       0        40 United-States  <=50K
## 2      Husband White   Male       0       0        50 United-States  <=50K
## 3      Husband White   Male       0       0        40 United-States   >50K
## 4      Husband Black   Male    7688       0        40 United-States   >50K
## 5    Own-child White Female       0       0        30 United-States  <=50K
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# vtable::st (alias of sumtable; vtable was attached in section 2.5) with the
# same two summ vectors as before: numeric-column stats first, then
# count/percent for factor levels.
st(Adults, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##                           Variable     N    Mean     SD   Min    P25    P50
## 1                              Age 48842  38.644 13.711    17     28     37
## 2                            Class 48842                                   
## 3                            ... ?  2799  5.731%                           
## 4                          ... Gov  6549 13.409%                           
## 5                 ... Never-worked    10   0.02%                           
## 6                      ... Private 33906  69.42%                           
## 7                         ... Self  5557 11.378%                           
## 8                  ... Without-pay    21  0.043%                           
## 9                           Fnlwgt 48842  189664 105604 12285 117551 178145
## 10                          Educat 48842                                   
## 11                        ... 10th  1389  2.844%                           
## 12                        ... 11th  1812   3.71%                           
## 13                        ... 12th   657  1.345%                           
## 14                     ... 1st-4th   247  0.506%                           
## 15                     ... 5th-6th   509  1.042%                           
## 16                     ... 7th-8th   955  1.955%                           
## 17                         ... 9th   756  1.548%                           
## 18                  ... Assoc-acdm  1601  3.278%                           
## 19                   ... Assoc-voc  2061   4.22%                           
## 20                   ... Bachelors  8025 16.431%                           
## 21                   ... Doctorate   594  1.216%                           
## 22                     ... HS-grad 15784 32.316%                           
## 23                     ... Masters  2657   5.44%                           
## 24                   ... Preschool    83   0.17%                           
## 25                 ... Prof-school   834  1.708%                           
## 26                ... Some-college 10878 22.272%                           
## 27                          EduNum 48842  10.078  2.571     1      9     10
## 28                         Marital 48842                                   
## 29                    ... Divorced  6633 13.581%                           
## 30                     ... Married 23044 47.181%                           
## 31               ... Never-married 16117 32.998%                           
## 32                   ... Separated  1530  3.133%                           
## 33                     ... Widowed  1518  3.108%                           
## 34                         Occupat 48842                                   
## 35                           ... ?  2809  5.751%                           
## 36                ... Adm-clerical  5611 11.488%                           
## 37                ... Armed-Forces    15  0.031%                           
## 38                ... Craft-repair  6112 12.514%                           
## 39             ... Exec-managerial  6086 12.461%                           
## 40             ... Farming-fishing  1490  3.051%                           
## 41           ... Handlers-cleaners  2072  4.242%                           
## 42           ... Machine-op-inspct  3022  6.187%                           
## 43               ... Other-service  4923 10.079%                           
## 44             ... Priv-house-serv   242  0.495%                           
## 45              ... Prof-specialty  6172 12.637%                           
## 46             ... Protective-serv   983  2.013%                           
## 47                       ... Sales  5504 11.269%                           
## 48                ... Tech-support  1446  2.961%                           
## 49            ... Transport-moving  2355  4.822%                           
## 50                    Relationship 48842                                   
## 51                     ... Husband 19716 40.367%                           
## 52               ... Not-in-family 12583 25.763%                           
## 53              ... Other-relative  1506  3.083%                           
## 54                   ... Own-child  7581 15.521%                           
## 55                   ... Unmarried  5125 10.493%                           
## 56                        ... Wife  2331  4.773%                           
## 57                            Race 48842                                   
## 58          ... Amer-Indian-Eskimo   470  0.962%                           
## 59          ... Asian-Pac-Islander  1519   3.11%                           
## 60                       ... Black  4685  9.592%                           
## 61                       ... Other   406  0.831%                           
## 62                       ... White 41762 85.504%                           
## 63                          Gender 48842                                   
## 64                      ... Female 16192 33.152%                           
## 65                        ... Male 32650 66.848%                           
## 66                         CapGain 48842  1079.1   7452     0      0      0
## 67                         CapLoss 48842  87.502    403     0      0      0
## 68                       WeekHours 48842  40.422 12.391     1     40     40
## 69                          Native 48842                                   
## 70                           ... ?   857  1.755%                           
## 71                    ... Cambodia    28  0.057%                           
## 72                      ... Canada   182  0.373%                           
## 73                       ... China   122   0.25%                           
## 74                    ... Columbia    85  0.174%                           
## 75                        ... Cuba   138  0.283%                           
## 76          ... Dominican-Republic   103  0.211%                           
## 77                     ... Ecuador    45  0.092%                           
## 78                 ... El-Salvador   155  0.317%                           
## 79                     ... England   127   0.26%                           
## 80                      ... France    38  0.078%                           
## 81                     ... Germany   206  0.422%                           
## 82                      ... Greece    49    0.1%                           
## 83                   ... Guatemala    88   0.18%                           
## 84                       ... Haiti    75  0.154%                           
## 85          ... Holand-Netherlands     1  0.002%                           
## 86                    ... Honduras    20  0.041%                           
## 87                        ... Hong    30  0.061%                           
## 88                     ... Hungary    19  0.039%                           
## 89                       ... India   151  0.309%                           
## 90                        ... Iran    59  0.121%                           
## 91                     ... Ireland    37  0.076%                           
## 92                       ... Italy   105  0.215%                           
## 93                     ... Jamaica   106  0.217%                           
## 94                       ... Japan    92  0.188%                           
## 95                        ... Laos    23  0.047%                           
## 96                      ... Mexico   951  1.947%                           
## 97                   ... Nicaragua    49    0.1%                           
## 98  ... Outlying-US(Guam-USVI-etc)    23  0.047%                           
## 99                        ... Peru    46  0.094%                           
## 100                ... Philippines   295  0.604%                           
## 101                     ... Poland    87  0.178%                           
## 102                   ... Portugal    67  0.137%                           
## 103                ... Puerto-Rico   184  0.377%                           
## 104                   ... Scotland    21  0.043%                           
## 105                      ... South   115  0.235%                           
## 106                     ... Taiwan    65  0.133%                           
## 107                   ... Thailand    30  0.061%                           
## 108            ... Trinadad&Tobago    27  0.055%                           
## 109              ... United-States 43832 89.742%                           
## 110                    ... Vietnam    86  0.176%                           
## 111                 ... Yugoslavia    23  0.047%                           
## 112                         Income 48842                                   
## 113                      ... <=50K 37155 76.072%                           
## 114                       ... >50K 11687 23.928%                           
##        P75     Max NA Mode
## 1       48      90  0     
## 2                         
## 3                         
## 4                         
## 5                         
## 6                         
## 7                         
## 8                         
## 9   237642 1490400  0     
## 10                        
## 11                        
## 12                        
## 13                        
## 14                        
## 15                        
## 16                        
## 17                        
## 18                        
## 19                        
## 20                        
## 21                        
## 22                        
## 23                        
## 24                        
## 25                        
## 26                        
## 27      12      16  0     
## 28                        
## 29                        
## 30                        
## 31                        
## 32                        
## 33                        
## 34                        
## 35                        
## 36                        
## 37                        
## 38                        
## 39                        
## 40                        
## 41                        
## 42                        
## 43                        
## 44                        
## 45                        
## 46                        
## 47                        
## 48                        
## 49                        
## 50                        
## 51                        
## 52                        
## 53                        
## 54                        
## 55                        
## 56                        
## 57                        
## 58                        
## 59                        
## 60                        
## 61                        
## 62                        
## 63                        
## 64                        
## 65                        
## 66       0   99999  0     
## 67       0    4356  0     
## 68      45      99  0     
## 69                        
## 70                        
## 71                        
## 72                        
## 73                        
## 74                        
## 75                        
## 76                        
## 77                        
## 78                        
## 79                        
## 80                        
## 81                        
## 82                        
## 83                        
## 84                        
## 85                        
## 86                        
## 87                        
## 88                        
## 89                        
## 90                        
## 91                        
## 92                        
## 93                        
## 94                        
## 95                        
## 96                        
## 97                        
## 98                        
## 99                        
## 100                       
## 101                       
## 102                       
## 103                       
## 104                       
## 105                       
## 106                       
## 107                       
## 108                       
## 109                       
## 110                       
## 111                       
## 112                       
## 113                       
## 114

4.4 Standardise Numeric Variables

# Standardise Numeric Variables
# Each numeric predictor is converted to a z-score (mean 0, sd 1) and stored
# in a new ".z" column; the original columns are left untouched.
z_score <- function(v) (v - mean(v)) / sd(v)

Adults$Age.z       <- z_score(Adults$Age)
Adults$EduNum.z    <- z_score(Adults$EduNum)
Adults$CapGain.z   <- z_score(Adults$CapGain)
Adults$CapLoss.z   <- z_score(Adults$CapLoss)
Adults$WeekHours.z <- z_score(Adults$WeekHours)

# Confirm the five new ".z" columns were appended
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"      
## [16] "Age.z"        "EduNum.z"     "CapGain.z"    "CapLoss.z"    "WeekHours.z"

4.5 Use Predictors to Classify Whether or not a Person’s Income is less than $50K

# Calculate the Income Tree
# CART classification tree for Income, using the standardised numeric
# predictors plus the categorical demographics. minsplit = 2 / minbucket = 1
# relax the defaults so even very small nodes may be split.
IncomeTree <- rpart(
  Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z +
    Race + Gender + Class + Marital,
  data      = Adults,
  method    = "class",
  minsplit  = 2,
  minbucket = 1
)
print(IncomeTree)
## n= 48842 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 48842 11690 <=50K (0.76072 0.23928)  
##    2) Marital=Divorced,Never-married,Separated,Widowed 25798  1631 <=50K (0.93678 0.06322)  
##      4) CapGain.z< 0.802 25352  1200 <=50K (0.95267 0.04733) *
##      5) CapGain.z>=0.802 446    15 >50K (0.03363 0.96637) *
##    3) Marital=Married 23044 10060 <=50K (0.56362 0.43638)  
##      6) EduNum.z< 0.942 16234  5229 <=50K (0.67790 0.32210)  
##       12) CapGain.z< 0.539 15465  4475 <=50K (0.71064 0.28936)  
##         24) EduNum.z< -0.6138 2672   266 <=50K (0.90045 0.09955) *
##         25) EduNum.z>=-0.6138 12793  4209 <=50K (0.67099 0.32901)  
##           50) CapLoss.z< 4.3 12298  3833 <=50K (0.68832 0.31168) *
##           51) CapLoss.z>=4.3 495   119 >50K (0.24040 0.75960) *
##       13) CapGain.z>=0.539 769    15 >50K (0.01951 0.98049) *
##      7) EduNum.z>=0.942 6810  1983 >50K (0.29119 0.70881) *

4.6 Plot Decision Tree using Cart

# Plot the Decision Tree
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?", 
           under = FALSE,  
           type = 2,                 # split labels below the node boxes
           extra = 106,              # class probability + % of observations per node
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,                # display node numbers
           fallen.leaves = TRUE,     # put all leaves on one row at the bottom
           digits = 3,
           box.palette = "RdGn"      # red-to-green diverging node colouring
           )

5 Classification and Regression Tree Using Conditional Inference Tree (CTREE, C4.5 Method)

5.1 Calculate Decision Tree Using Partykit and CTREE

# Install (when missing) and attach the packages used for conditional
# inference trees and their plotting.
for (pkg in c("partykit", "ggparty")) {
  if (!pkg %in% installed.packages()) { install.packages(pkg, dependencies = TRUE) }
  library(pkg, character.only = TRUE)
}

# Calculate IncomeTree
# The hyper-parameters are collected in an explicit ctree_control() object.
# The original calls forwarded them through "..." and ended with a stray
# trailing comma, which R silently matched as an empty positional argument
# (absorbed by the unused `subset` formal) — removed here.
tree_control <- ctree_control(
  minsplit     = 12,           # minimum node size required to attempt a split
  maxdepth     =  3,           # cap depth so the plotted tree stays readable
  teststat     = "quadratic",
  testtype     = "Bonferroni", # multiplicity-adjusted permutation tests
  minbucket    = 10,           # minimum observations in any terminal node
  mincriterion = 0.95          # 1 - alpha required to accept a split
)

IncomeTree <- ctree(Income ~ Age + EduNum + CapGain + CapLoss + WeekHours +
                      Race + Gender + Class + Marital,
                    data = Adults, weights = NULL, control = tree_control)

# Calculate IncomeTree from Normalised Values (same settings; only the
# numeric predictors are the z-scored versions)
IncomeTree.z <- ctree(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z +
                        Race + Gender + Class + Marital,
                      data = Adults, weights = NULL, control = tree_control)

print(IncomeTree)
## 
## Model formula:
## Income ~ Age + EduNum + CapGain + CapLoss + WeekHours + Race + 
##     Gender + Class + Marital
## 
## Fitted party:
## [1] root
## |   [2] Marital in Divorced, Never-married, Separated, Widowed
## |   |   [3] CapGain <= 6849
## |   |   |   [4] EduNum <= 13: <=50K (n = 23975, err = 4%)
## |   |   |   [5] EduNum > 13: <=50K (n = 1377, err = 25%)
## |   |   [6] CapGain > 6849
## |   |   |   [7] WeekHours <= 35: >50K (n = 46, err = 20%)
## |   |   |   [8] WeekHours > 35: >50K (n = 400, err = 2%)
## |   [9] Marital in Married
## |   |   [10] EduNum <= 12
## |   |   |   [11] EduNum <= 8: <=50K (n = 2726, err = 12%)
## |   |   |   [12] EduNum > 8: <=50K (n = 13508, err = 36%)
## |   |   [13] EduNum > 12
## |   |   |   [14] WeekHours <= 31: <=50K (n = 567, err = 44%)
## |   |   |   [15] WeekHours > 31: >50K (n = 6243, err = 27%)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8
print(IncomeTree.z)
## 
## Model formula:
## Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + 
##     Race + Gender + Class + Marital
## 
## Fitted party:
## [1] root
## |   [2] Marital in Divorced, Never-married, Separated, Widowed
## |   |   [3] CapGain.z <= 0.77
## |   |   |   [4] EduNum.z <= 1.14: <=50K (n = 23975, err = 4%)
## |   |   |   [5] EduNum.z > 1.14: <=50K (n = 1377, err = 25%)
## |   |   [6] CapGain.z > 0.77
## |   |   |   [7] WeekHours.z <= -0.44: >50K (n = 46, err = 20%)
## |   |   |   [8] WeekHours.z > -0.44: >50K (n = 400, err = 2%)
## |   [9] Marital in Married
## |   |   [10] EduNum.z <= 0.75
## |   |   |   [11] EduNum.z <= -0.81: <=50K (n = 2726, err = 12%)
## |   |   |   [12] EduNum.z > -0.81: <=50K (n = 13508, err = 36%)
## |   |   [13] EduNum.z > 0.75
## |   |   |   [14] WeekHours.z <= -0.76: <=50K (n = 567, err = 44%)
## |   |   |   [15] WeekHours.z > -0.76: >50K (n = 6243, err = 27%)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8

5.2 Plot Decision Tree Using Partykit

# Plot the Decision Tree
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)
indexcol  <- hcl(150, 200, 50, 0.4)

# plot() on a party object draws via grid graphics and returns NULL
# invisibly, so its result cannot be handed to ggsave(). Save the figure
# by opening a png device, drawing, and closing the device instead.
png(filename = "Is a Person Likely to Earn 50k.png",
    width = 12, height = 8, units = "in", res = 300)
plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(IncomeTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(IncomeTree, col = "black", 
                               # columncol has 3 elements, so index 4 was NA;
                               # two fills suffice for the two income classes
                               fill = columncol[c(2, 1)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))
dev.off()

6 Classification and Regression Tree Using the C5.0 Algorithm (C5.0 Method)

6.1 Calculate Decision Tree Using C5.0

# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)

# Make Variables
# Response as a factor and a predictor data frame, as required by the
# default (x/y) interface of C5.0. Columns are selected by position;
# judging from the fitted tree below they include Age, Marital, Educat,
# WeekHours and Class — TODO confirm against the Adults column order.
Outcome <- as.factor(Adults$Income)
Factors <- data.frame(Adults[, c(1, 2, 4, 6, 9, 10, 13, 14)])

# Calculate IncomeTree from Observed Values
# CF = 0.05 prunes aggressively; minCases = 50 forbids tiny leaves.
# NOTE(review): seed is drawn randomly on every run, so repeated knits
# can produce different trees — use a fixed seed for reproducibility.
IncomeTree <- C5.0(y = Outcome, x = Factors, rules = FALSE, control = C5.0Control(
  subset = FALSE,
  bands = 0,
  winnow = FALSE,
  noGlobalPruning = FALSE,
  CF = 0.05,
  minCases = 50,
  fuzzyThreshold = FALSE,
  seed = sample.int(4096, size = 1) - 1L,
  earlyStopping = TRUE,
  label = "outcome"))

# Calculate IncomeTree from Normalised Values
#IncomeTree.z <- C5.0(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = Adults, rules = FALSE)

# Show the fitted tree and its training-set evaluation
summary(IncomeTree)
## 
## Call:
## C5.0.default(x = Factors, y = Outcome, rules = FALSE, control
##  = FALSE, CF = 0.05, minCases = 50, fuzzyThreshold = FALSE, seed
##  = sample.int(4096, size = 1) - 1L, earlyStopping = TRUE, label = "outcome"))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:14 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 48842 cases (9 attributes) from undefined.data
## 
## Decision tree:
## 
## Age <= 27: <=50K (12012/369)
## Age > 27:
## :...Marital in {Divorced,Never-married,Separated,
##     :           Widowed}: <=50K (15612/1529)
##     Marital = Married:
##     :...Educat in {10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-voc,
##         :          HS-grad,Preschool}: <=50K (10153/2939)
##         Educat in {Doctorate,Masters,Prof-school}: >50K (2539/530)
##         Educat = Assoc-acdm:
##         :...Age <= 35: <=50K (180/68)
##         :   Age > 35: >50K (482/206)
##         Educat = Bachelors:
##         :...WeekHours <= 31: <=50K (323/135)
##         :   WeekHours > 31: >50K (3682/1094)
##         Educat = Some-college:
##         :...WeekHours <= 34: <=50K (353/92)
##             WeekHours > 34:
##             :...Class in {?,Never-worked}: >50K (0)
##                 Class in {Self,Without-pay}: <=50K (615.9/271)
##                 Class = Gov:
##                 :...Age <= 38: <=50K (185/71.6)
##                 :   Age > 38: >50K (364.7/157.7)
##                 Class = Private:
##                 :...WeekHours > 41:
##                     :...Age <= 37: <=50K (348.3/139)
##                     :   Age > 37: >50K (608/235.3)
##                     WeekHours <= 41:
##                     :...Age > 61: <=50K (65/25)
##                         Age <= 61:
##                         :...Age <= 47: <=50K (982.7/403)
##                             Age > 47: >50K (336.3/143.7)
## 
## 
## Evaluation on training data (48842 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      17 8408(17.2%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##   34787  2368    (a): class <=50K
##    6040  5647    (b): class >50K
## 
## 
##  Attribute usage:
## 
##  100.00% Age
##   75.41% Marital
##   43.44% Educat
##   16.10% WeekHours
##    7.04% Class
## 
## 
## Time: 0.1 secs
#summary(IncomeTree.z)

6.2 Plot Decision Tree Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors used by
# the partykit plots were never referenced in this chunk (dead code), and the
# stray trailing comma created an empty argument — both removed.
plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k or more?", cex.main = 1.5,
     gp = gpar(fontsize = 15),
     drop_terminal = FALSE, tnex = 1)

6.3 Get Data to Determine Whether Customer Should be Classified as Credit Risk

# Input Data Set: eight customers with savings/assets levels, income in
# thousands, and the observed credit-risk label.
savings_lvl <- c("Medium", "Low", "High", "Medium", "Low", "High", "Low", "Medium")
assets_lvl  <- c("High", "Low", "Medium", "Medium", "Medium", "High", "Low", "Medium")
income_k    <- c(75, 50, 25, 50, 100, 25, 25, 75)
risk_label  <- c("Good", "Bad", "Bad", "Good", "Good", "Good", "Bad", "Good")

RiskData <- data.frame(
  Customer = as.numeric(seq_along(risk_label)),
  Savings  = savings_lvl,
  Assets   = assets_lvl,
  IncomeK  = income_k,
  Risk     = risk_label
)

# Display Data Table
RiskData
##   Customer Savings Assets IncomeK Risk
## 1        1  Medium   High      75 Good
## 2        2     Low    Low      50  Bad
## 3        3    High Medium      25  Bad
## 4        4  Medium Medium      50 Good
## 5        5     Low Medium     100 Good
## 6        6    High   High      25 Good
## 7        7     Low    Low      25  Bad
## 8        8  Medium Medium      75 Good
# Explore Data
# NOTE(review): sumtable() is from the 'vtable' package, which is not loaded
# in this section — presumably attached earlier in the file; verify.
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# The first summ/summ.names pair applies to numeric columns (full set of
# statistics); the second applies to factor/character columns (count and
# percentage per level).
sumtable(RiskData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##      Variable N   Mean     SD Min  P25 P50  P75 Max NA Mode
## 1    Customer 8    4.5 2.4495   1 2.75 4.5 6.25   8  0     
## 2     Savings 8                                            
## 3    ... High 2    25%                                     
## 4     ... Low 3  37.5%                                     
## 5  ... Medium 3  37.5%                                     
## 6      Assets 8                                            
## 7    ... High 2    25%                                     
## 8     ... Low 2    25%                                     
## 9  ... Medium 4    50%                                     
## 10    IncomeK 8 53.125  28.15  25   25  50   75 100  0     
## 11       Risk 8                                            
## 12    ... Bad 3  37.5%                                     
## 13   ... Good 5  62.5%
# Write data to working directory
# (write.csv adds a row-name index column by default; pass row.names = FALSE
# to suppress it if a clean file is wanted)
write.csv(RiskData, file = "RiskData.csv")

6.4 Use Predictors to Determine Whether Customer Should be Classified as Credit Risk Using C5.0

# Make Variables
# NOTE(review): Factors is built here but never used below — the formula
# interface pulls Savings/Assets/IncomeK straight from RiskData, and only
# Outcome (the Risk labels as a factor) is referenced.
Outcome <- as.factor(RiskData$Risk)
Factors <- data.frame(RiskData[, c(2, 3, 4)])

# Calculate RiskTree from Observed Values (default C5.0 settings, tree output)
RiskTree <- C5.0(Outcome ~ Savings + Assets + IncomeK, data = RiskData, rules = FALSE)

# Show the fitted tree and its training-set evaluation
summary(RiskTree)
## 
## Call:
## C5.0.formula(formula = Outcome ~ Savings + Assets + IncomeK, data =
##  RiskData, rules = FALSE)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:15 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 8 cases (4 attributes) from undefined.data
## 
## Decision tree:
## 
## Assets in {High,Medium}: Good (6/1)
## Assets = Low: Bad (2)
## 
## 
## Evaluation on training data (8 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    1(12.5%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##       2     1    (a): class Bad
##             5    (b): class Good
## 
## 
##  Attribute usage:
## 
##  100.00% Assets
## 
## 
## Time: 0.0 secs

6.5 Plot Decision Tree to Determine Whether Customer Should be Classified as Credit Risk Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors were
# never referenced in this chunk (dead code), and the stray trailing comma
# created an empty argument — both removed.
plot(RiskTree, main = "Classification Tree: Is a Person Likely to be Credit Risk?", cex.main = 1.5,
     gp = gpar(fontsize = 11),
     drop_terminal = TRUE, tnex = 1)

7 Assignment

7.1 Load Libraries and Settings for the Assignment

# Install (when missing) and attach the CART packages used in the assignment
for (pkg in c("rpart", "rpart.plot")) {
  if (!pkg %in% installed.packages()) { install.packages(pkg, dependencies = TRUE) }
  library(pkg, character.only = TRUE)
}

# Set number of decimals
options(digits = 4)              # Modify global options

7.2 Get Data for Predicting Whether Going For Morning Walk

# Create the dataset: 16 mornings with weather, schedule and prior-exercise
# indicators plus the observed decision.
# The original chunk defined only 12 rows and no Day column, but the printed
# table and both fitted trees below report n = 16; the rows here are
# reconstructed exactly from that printed output (verified to reproduce the
# rpart node counts: 9 No / 7 Yes, all 6 rainy days are No, etc.).
Exercise <- data.frame(
  Day  = 1:16,
  Rain = c('No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No',
           'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No'),
  EarlyMeeting = c('No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No',
                   'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes'),
  ExerciseEveningBefore = c('No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
                            'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No'),
  GoForWalk = c('Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
                'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes')
)

# Show Data
Exercise
##    Day Rain EarlyMeeting ExerciseEveningBefore GoForWalk
## 1    1   No           No                    No       Yes
## 2    2  Yes           No                    No        No
## 3    3  Yes          Yes                   Yes        No
## 4    4   No          Yes                   Yes        No
## 5    5   No          Yes                   Yes        No
## 6    6   No          Yes                   Yes        No
## 7    7   No           No                    No       Yes
## 8    8   No           No                   Yes       Yes
## 9    9  Yes          Yes                    No        No
## 10  10  Yes           No                    No        No
## 11  11  Yes           No                   Yes        No
## 12  12  Yes          Yes                    No        No
## 13  13   No           No                   Yes       Yes
## 14  14   No           No                   Yes       Yes
## 15  15   No          Yes                    No       Yes
## 16  16   No          Yes                    No       Yes

7.3 Calculate Decision Tree

# Calculate the Tree
# Grow aggressively: allow splits down to a single observation and keep the
# complexity penalty very small so every informative split is made.
control <- rpart.control(minsplit = 1, minbucket = 1, cp = 0.001, maxdepth = 30)

mytree <- rpart(GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore,
                data    = Exercise,
                method  = "class",
                control = control)

# Show the fitted tree in text form
print(mytree)
## n= 16 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 16 7 No (0.5625 0.4375)  
##    2) Rain=Yes 6 0 No (1.0000 0.0000) *
##    3) Rain=No 10 3 Yes (0.3000 0.7000)  
##      6) EarlyMeeting=Yes 5 2 No (0.6000 0.4000)  
##       12) ExerciseEveningBefore=Yes 3 0 No (1.0000 0.0000) *
##       13) ExerciseEveningBefore=No 2 0 Yes (0.0000 1.0000) *
##      7) EarlyMeeting=No 5 0 Yes (0.0000 1.0000) *
# Plot the Tree
# The cat() calls below only print a legend describing what rpart.plot shows.
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node number\n")
## - Node number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of observations in the node\n")
## - Percentage of observations in the node
# type = 2: split labels below the node boxes.
# extra = 106: class probability plus percentage of observations per node.
rpart.plot(mytree, main = "Classification Tree To Decide Going for Morning Walk", 
           under = FALSE,  
           type = 2,
           extra = 106,
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,
           fallen.leaves = TRUE, 
           digits = 3,
           box.palette = "RdGn"
           )

7.4 Calculate Decision Tree Using PartyKit

# Load Library if not available
if(! "partykit" %in% installed.packages()) { install.packages("partykit", dependencies = TRUE) }
library(partykit)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "ggparty" %in% installed.packages()) { install.packages("ggparty", dependencies = TRUE) }
library(ggparty)

# Calculate ExerciseTree
# minsplit/minbucket are forwarded through "..." to ctree_control(); both
# are set to 1 so even tiny nodes may be split.
ExerciseTree <- ctree(GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore, data = Exercise, 
     minsplit = 1, 
     minbucket = 1)

# Print the fitted conditional inference tree
ExerciseTree
## 
## Model formula:
## GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore
## 
## Fitted party:
## [1] root
## |   [2] Rain in No: Yes (n = 10, err = 30%)
## |   [3] Rain in Yes: No (n = 6, err = 0%)
## 
## Number of inner nodes:    1
## Number of terminal nodes: 2

8 Plot Decision Tree Using Partykit

# Plot the Decision Tree
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)   # NOTE(review): defined but not used below
indexcol  <- hcl(150, 200, 50, 0.4)   # NOTE(review): defined but not used below

# NOTE(review): columncol has 3 elements, so columncol[c(2,1,4)] yields NA
# as its third fill; with the two-level GoForWalk outcome only the first
# two fills should matter — verify the rendered colours.
plot(ExerciseTree, main="Classification Tree: Are we going For a Walk This Morning?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(ExerciseTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(ExerciseTree, col = "black", 
                               fill = columncol[c(2,1,4)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))

8.1 Calculate Decision Tree Using C5.0

# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)

# Make Variables
# Response as a factor and a predictor data frame for the default (x/y)
# interface of C5.0; the data.frame() call gives the columns the dotted
# names (Exercise.Rain, ...) seen in the printed trees below.
Outcome <- as.factor(Exercise$GoForWalk)
Factors <- data.frame(
  Exercise$Rain, Exercise$EarlyMeeting, Exercise$ExerciseEveningBefore)

# Calculate ExerciseTree from Observed Values
# trials = 5 requests boosting with five iterations; CF = 0.05 prunes
# aggressively and minCases = 1 allows single-case leaves.
# NOTE(review): seed is drawn randomly on every run, so repeated knits can
# give different boosted trees — use a fixed seed for reproducibility.
ExerciseTree <- C5.0(y = Outcome, x = Factors, rules = FALSE, trials = 5, control = C5.0Control(
  subset = FALSE,
  bands = 0,
  winnow = FALSE,
  noGlobalPruning = FALSE,
  CF = 0.05,
  minCases = 1,
  fuzzyThreshold = FALSE,
  seed = sample.int(4096, size = 1) - 1L,
  earlyStopping = FALSE,
  label = "outcome"))

# Show the tree from each boosting trial plus training-set performance
summary(ExerciseTree)
## 
## Call:
## C5.0.default(x = Factors, y = Outcome, trials = 5, rules = FALSE, control
##  = FALSE, CF = 0.05, minCases = 1, fuzzyThreshold = FALSE, seed
##  = sample.int(4096, size = 1) - 1L, earlyStopping = FALSE, label = "outcome"))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:17 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 16 cases (4 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (6)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (5)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (2)
##         Exercise.ExerciseEveningBefore = Yes: No (3)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## Exercise.Rain = No: Yes (7.5/2.2)
## Exercise.Rain = Yes: No (4.5)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (4.8)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (4)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (1.6)
##         Exercise.ExerciseEveningBefore = Yes: No (5.5)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
##  No (12/4.2)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (4.3)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (4.9)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (2)
##         Exercise.ExerciseEveningBefore = Yes: No (4.9)
## 
## 
## Evaluation on training data (16 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0      4    0( 0.0%)
##    1      2    3(18.8%)
##    2      4    0( 0.0%)
##    3      1    7(43.8%)
##    4      4    0( 0.0%)
## boost              0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##       9          (a): class No
##             7    (b): class Yes
## 
## 
##  Attribute usage:
## 
##  100.00% Exercise.Rain
##   62.50% Exercise.EarlyMeeting
##   31.25% Exercise.ExerciseEveningBefore
## 
## 
## Time: 0.0 secs

8.2 Plot Decision Tree Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors were
# never referenced in this chunk (dead code), and the stray trailing comma
# created an empty argument — both removed.
plot(ExerciseTree, main = "Classification Tree: Are we Going for a Walk this Morning?", cex.main = 1.5,
     gp = gpar(fontsize = 15),
     drop_terminal = FALSE, tnex = 1)

9 Classification Using Random Forest

9.1 Get Data and Library

# Load Library if not available
if(! "randomForest" %in% installed.packages()) { install.packages("randomForest", dependencies = TRUE) }
library(randomForest)

# Load the iris dataset (4 numeric measurements per flower plus the
# Species label used as the classification target)
data(iris)

# View the first few rows of the dataset
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

9.2 Split the Data

# Set seed for reproducibility
set.seed(123)

# Draw a 70% training sample; the rows not sampled form the 30% test set
n_obs       <- nrow(iris)
train_index <- sample(seq_len(n_obs), size = 0.7 * n_obs)
train_data  <- iris[train_index, ]
test_data   <- iris[-train_index, ]

9.3 Train Random Forest Model

# Train the random forest model
# importance = TRUE stores the variable-importance measures used in 9.5;
# ntree = 100 trees are grown on the training split.
rf_model <- randomForest(Species ~ ., data = train_data, importance = TRUE, ntree = 100)

# Print the model summary (includes the out-of-bag error estimate and
# the OOB confusion matrix)
print(rf_model)
## 
## Call:
##  randomForest(formula = Species ~ ., data = train_data, importance = TRUE,      ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.76%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         36          0         0     0.00000
## versicolor      0         29         3     0.09375
## virginica       0          2        35     0.05405

9.4 Evaluate Model on Testing Data

# Make predictions on the testing set
predictions <- predict(rf_model, test_data)

# Create a confusion matrix (rows = predicted class, columns = actual species)
confusion_matrix <- table(predictions, test_data$Species)

# Print the confusion matrix
print(confusion_matrix)
##             
## predictions  setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         17         0
##   virginica       0          1        13
# Calculate accuracy: correct predictions (the diagonal) over all test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 97.78 %"

9.5 Analyse Variable Importance

# View the importance of each variable
# (per-class columns plus MeanDecreaseAccuracy / MeanDecreaseGini, available
# because the model was fitted with importance = TRUE)
importance(rf_model)
##              setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
## Sepal.Length  2.485     4.1775     1.214                4.456            5.829
## Sepal.Width   1.725    -0.2708     3.803                3.246            1.586
## Petal.Length  9.533    13.3983    13.888               15.158           29.960
## Petal.Width  10.340    12.5554    13.980               15.020           31.708
# Plot variable importance (dot charts of the two MeanDecrease measures)
varImpPlot(rf_model)