1 Load Libraries

# Loading necessary packages.
# Each package is installed on first use, then attached.
# FIX(review): package presence is now checked against
# rownames(installed.packages()) — the documented package-name index — rather
# than the whole metadata matrix, where a match in any other column (version,
# license, ...) could coincidentally succeed.

## Markdown Update
if(! "rmarkdown" %in% rownames(installed.packages())) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

## Loading other packages if not available
if(! "readxl" %in% rownames(installed.packages())) { install.packages("readxl", dependencies = TRUE) }
library(readxl)

if(! "tidyr" %in% rownames(installed.packages())) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)

if(! "zoo" %in% rownames(installed.packages())) { install.packages("zoo", dependencies = TRUE) }
library(zoo)

if(! "car" %in% rownames(installed.packages())) { install.packages("car", dependencies = TRUE) }
library(car)

# Global Settings
options(digits = 4)   # print numeric output with 4 significant digits
options(scipen = 999) # strongly prefer fixed over scientific notation

# Setting working Directory
# NOTE(review): machine-specific absolute path — this breaks on any other
# machine. Consider a project-relative path instead of setwd().
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

# Make date string: today's date in Singapore time, formatted yymmdd
# (e.g. "240131"), for use in file names.
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

2 Classification and Regression Tree Using CART

2.1 Get Data to Classify Whether or not a Car Insurance Claim was Fraudulent

# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)
if(! "rpart.plot" %in% installed.packages()) { install.packages("rpart.plot", dependencies = TRUE) }
library(rpart.plot)

# Input Data Set
# Toy training set of three insurance claims: RearEnd (was it a rear-end
# collision?) coincides exactly with Fraud in these rows, so the tree below
# can separate the classes perfectly.
train <- data.frame(
  ClaimID = c(1,2,3),
  RearEnd = c(TRUE, FALSE, TRUE),
  Fraud   = c(TRUE, FALSE, TRUE)
)

# Show Data
train
##   ClaimID RearEnd Fraud
## 1       1    TRUE  TRUE
## 2       2   FALSE FALSE
## 3       3    TRUE  TRUE
# Write data to working directory
# NOTE(review): write.csv() also writes row names as an extra unnamed first
# column; add row.names = FALSE if that column is unwanted downstream.
write.csv(train, file = "ClaimTrain.csv")

2.2 Calculate Decision Tree

# Calculate the Tree
# CART classification tree: predict Fraud from RearEnd alone.
# minsplit = 2 / minbucket = 1 relax rpart's defaults so a split is allowed
# even on this 3-row data set.
mytree <- rpart(
  Fraud ~ RearEnd, 
  data = train, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(mytree)
## n= 3 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3 1 TRUE (0.3333 0.6667)  
##   2) RearEnd< 0.5 1 0 FALSE (1.0000 0.0000) *
##   3) RearEnd>=0.5 2 0 TRUE (0.0000 1.0000) *
# Plot the Tree
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(mytree, main = "Classification Tree To Identify Insurance Fraud", 
           under = FALSE,  
           type = 2,                 # split labels below the node boxes
           extra = 106,              # class probability + % of observations per node
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,                # display node numbers
           fallen.leaves = TRUE,     # put all leaves on one row at the bottom
           digits = 3,
           box.palette = "RdGn"      # red-to-green diverging node colouring
           )

2.3 Get Data for Predicting Whether To Play Tennis

# Download Data from URL

2.4 Use Predictors Whether To Play Tennis to Develop Decision Tree - Using Recursive Partitioning (RPart)

# Change Play Tennis Column
# Recode the target from "Yes"/"No" to the more readable "Play"/"Don't Play".
# NOTE(review): relies on the PlayTennis data frame from the download chunk
# above, which appears empty in this rendering — confirm the download code is
# present before re-running.
PlayTennis$PlayTennis <- ifelse(PlayTennis$PlayTennis == "Yes", "Play", "Don't Play")

# Show Data
PlayTennis
##    Day  Outlook Temperature Humidity   Wind PlayTennis
## 1    1    Sunny         Hot     High   Weak Don't Play
## 2    2    Sunny         Hot     High Strong Don't Play
## 3    3 Overcast         Hot     High   Weak       Play
## 4    4    Rainy        Mild     High   Weak       Play
## 5    5    Rainy        Cool   Normal   Weak       Play
## 6    6    Rainy        Cool   Normal Strong Don't Play
## 7    7 Overcast        Cool   Normal Strong       Play
## 8    8    Sunny        Mild     High   Weak Don't Play
## 9    9    Sunny        Cool   Normal   Weak       Play
## 10  10    Rainy        Mild   Normal   Weak       Play
## 11  11    Sunny        Mild   Normal Strong       Play
## 12  12 Overcast        Mild     High Strong       Play
## 13  13 Overcast         Hot   Normal   Weak       Play
## 14  14    Rainy        Mild     High Strong Don't Play
# Calculate the Tree
# CART classification tree on the classic 14-row "play tennis" data.
# minsplit = 2 / minbucket = 1 again allow splits down to single observations.
TennisTree <- rpart(
  PlayTennis ~ Outlook + Temperature + Wind + Humidity, 
  data = PlayTennis, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(TennisTree)
## n= 14 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 14 5 Play (0.3571 0.6429)  
##    2) Outlook=Rainy,Sunny 10 5 Don't Play (0.5000 0.5000)  
##      4) Humidity=High 5 1 Don't Play (0.8000 0.2000)  
##        8) Outlook=Sunny 3 0 Don't Play (1.0000 0.0000) *
##        9) Outlook=Rainy 2 1 Don't Play (0.5000 0.5000)  
##         18) Wind=Strong 1 0 Don't Play (1.0000 0.0000) *
##         19) Wind=Weak 1 0 Play (0.0000 1.0000) *
##      5) Humidity=Normal 5 1 Play (0.2000 0.8000)  
##       10) Wind=Strong 2 1 Don't Play (0.5000 0.5000)  
##         20) Outlook=Rainy 1 0 Don't Play (1.0000 0.0000) *
##         21) Outlook=Sunny 1 0 Play (0.0000 1.0000) *
##       11) Wind=Weak 3 0 Play (0.0000 1.0000) *
##    3) Outlook=Overcast 4 0 Play (0.0000 1.0000) *
# Plot the Tree
rpart.plot(TennisTree, 
           shadow.col = "gray", 
           nn = TRUE,
           main = "Classification Tree To Decide to Play Tennis")

2.5 Get Data to Determine Whether Customer Should be Classified as Credit Risk

# Input Data Set
# Eight customers with categorical Savings/Assets levels, income in
# thousands (IncomeK), and the target credit Risk label (Good/Bad).
RiskData <- data.frame(
  Customer = c(1, 2, 3, 4, 5, 6, 7, 8),
  Savings  = c("Medium", "Low", "High", "Medium", "Low", "High", "Low", "Medium"),
  Assets   = c("High", "Low", "Medium", "Medium", "Medium", "High", "Low", "Medium"),
  IncomeK  = c(75, 50, 25, 50, 100, 25, 25, 75),
  Risk     = c("Good", "Bad", "Bad", "Good","Good", "Good", "Bad", "Good")
)

# Display Data Table
RiskData
##   Customer Savings Assets IncomeK Risk
## 1        1  Medium   High      75 Good
## 2        2     Low    Low      50  Bad
## 3        3    High Medium      25  Bad
## 4        4  Medium Medium      50 Good
## 5        5     Low Medium     100 Good
## 6        6    High   High      25 Good
## 7        7     Low    Low      25  Bad
## 8        8  Medium Medium      75 Good
# Load Library if not available
if(! "rpart" %in% installed.packages()) { install.packages("rpart", dependencies = TRUE) }
library(rpart)

# Load Library if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Explore Data
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# Summary table via vtable::sumtable. The first summ vector applies to
# numeric columns (count, mean, sd, quartiles, NA share, mode); the second
# applies to factor/character columns (count and percent per level).
sumtable(RiskData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##      Variable N   Mean     SD Min  P25 P50  P75 Max NA Mode
## 1    Customer 8    4.5 2.4495   1 2.75 4.5 6.25   8  0     
## 2     Savings 8                                            
## 3    ... High 2    25%                                     
## 4     ... Low 3  37.5%                                     
## 5  ... Medium 3  37.5%                                     
## 6      Assets 8                                            
## 7    ... High 2    25%                                     
## 8     ... Low 2    25%                                     
## 9  ... Medium 4    50%                                     
## 10    IncomeK 8 53.125  28.15  25   25  50   75 100  0     
## 11       Risk 8                                            
## 12    ... Bad 3  37.5%                                     
## 13   ... Good 5  62.5%

2.6 Use Predictors to Determine Whether Customer Should be Classified as Credit Risk

# Calculate the Tree
# CART classification tree: predict Risk from Savings, Assets and IncomeK.
# minsplit = 2 / minbucket = 1 relax the defaults so this 8-row data set
# can still be split.
RiskTree <- rpart(
  Risk ~ Savings + Assets + IncomeK, 
  data = RiskData, 
  method = "class",
  minsplit = 2, 
  minbucket = 1
)
print(RiskTree)
## n= 8 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 8 3 Good (0.3750 0.6250)  
##    2) Assets=Low 2 0 Bad (1.0000 0.0000) *
##    3) Assets=High,Medium 6 1 Good (0.1667 0.8333)  
##      6) Savings=High 2 1 Bad (0.5000 0.5000)  
##       12) Assets=Medium 1 0 Bad (1.0000 0.0000) *
##       13) Assets=High 1 0 Good (0.0000 1.0000) *
##      7) Savings=Low,Medium 4 0 Good (0.0000 1.0000) *
# Plot the Tree
rpart.plot(RiskTree, 
           shadow.col = "gray", 
           nn = TRUE,
           main = "Classification Tree To Identify Credit Risk")

3 Classification and Regression Tree Using Conditional Inference Tree (CTREE, C4.5 Method)

3.1 Calculate Decision Tree Using Partykit

# Load Library if not available
if(! "partykit" %in% installed.packages()) { install.packages("partykit", dependencies = TRUE) }
library(partykit)
# Calculate TennisTree (conditional inference tree).
# This overwrites the rpart TennisTree from section 2.4.
# mincriterion = 0 together with minsplit = 1 / minbucket = 0 disables the
# usual significance-based stopping, so the tree is grown out fully
# (a C4.5-like fully expanded tree).
# NOTE(review): the printed output below shows Yes/No labels, i.e. it was
# apparently produced before the Play/Don't Play recode in section 2.4 —
# verify which version of PlayTennis this chunk should use.
TennisTree <- ctree(PlayTennis ~ Outlook + Humidity + Wind + Temperature, data = PlayTennis, 
                    minbucket = 0, 
                    minsplit = 1,
                    testtype = "Teststatistic",
                    mincriterion = 0, 
                    weights = NULL)

TennisTree
## 
## Model formula:
## PlayTennis ~ Outlook + Humidity + Wind + Temperature
## 
## Fitted party:
## [1] root
## |   [2] Outlook in Overcast: Yes (n = 4, err = 0%)
## |   [3] Outlook in Rainy, Sunny
## |   |   [4] Humidity in High
## |   |   |   [5] Outlook in Rainy
## |   |   |   |   [6] Wind in Strong: No (n = 1, err = 0%)
## |   |   |   |   [7] Wind in Weak: Yes (n = 1, err = 0%)
## |   |   |   [8] Outlook in Sunny: No (n = 3, err = 0%)
## |   |   [9] Humidity in Normal
## |   |   |   [10] Wind in Strong
## |   |   |   |   [11] Temperature in Cool: No (n = 1, err = 0%)
## |   |   |   |   [12] Temperature in Mild: Yes (n = 1, err = 0%)
## |   |   |   [13] Wind in Weak: Yes (n = 3, err = 0%)
## 
## Number of inner nodes:    6
## Number of terminal nodes: 7

3.2 Plot Decision Tree to Play Tennis Using Partykit

# Load Library if not available
if(! "ggparty" %in% installed.packages()) { install.packages("ggparty", dependencies = TRUE) }
library(ggparty)

# Plot the Decision Tree
# Colour setup: three translucent HCL fills for the terminal bar plots.
# labelcol / indexcol are defined but not used in this chunk; kept in case
# later chunks reference them.
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)
indexcol  <- hcl(150, 200, 50, 0.4)

plot(TennisTree, main="Classification Tree: Are the Conditions Right to Play Tennis today?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(TennisTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(TennisTree, col = "black", 
                               # FIX: was columncol[c(2, 1, 4)] — index 4 is past the
                               # end of the length-3 colour vector and yielded NA.
                               # The response has two classes, so two fills suffice.
                               fill = columncol[c(2, 1)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))

3.3 Show Importance of Features

# Load Library if not available
if(! "vip" %in% installed.packages()) { install.packages("vip", dependencies = TRUE) }
library(vip)

# Create a variable importance plot
# Ranks the four predictors by importance in the fitted tree.
# NOTE(review): vip's support for partykit ctree objects depends on the vip
# version — confirm this renders rather than erroring on the model class.
var_importance <- vip::vip(TennisTree, num_features = 4)
print(var_importance)

4 Classification and Regression Tree Using CART

4.1 Get Data for Predicting Whether Income is < 50k

# Download Data from URL

4.2 Prepare Data

# Collapse Categories
# Merge several factor levels into broader groups:
#   - marital.status levels 2-4            -> "Married"
#   - workclass levels 2, 3, 8             -> "Gov"
#   - workclass levels 5, 6 (after merge)  -> "Self"
# NOTE(review): this indexes levels by position, so it assumes the default
# (alphabetical) level order of the freshly loaded Adults data. The download
# chunk above appears empty in this rendering — confirm it before re-running,
# and note that re-running this chunk twice would collapse the wrong levels.
levels(Adults$marital.status)[2:4] <- "Married"
levels(Adults$workclass)[c(2,3,8)] <- "Gov"
levels(Adults$workclass)[c(5, 6)]  <- "Self"


# Load Library if not available
if(! "stringr" %in% installed.packages()) { install.packages("stringr", dependencies = TRUE) }
library(stringr)

# Prepare Columns
# Title-case all column names, truncate them to 6 characters, then give
# selected columns explicit readable names.
names(Adults) <- str_to_title(names(Adults))
names(Adults) <- substr(names(Adults), 1, 6)

colnames(Adults)[2]  <- "Class"
colnames(Adults)[5]  <- "EduNum"
colnames(Adults)[6]  <- "Marital"
colnames(Adults)[7]  <- "Occupat"
colnames(Adults)[8]  <- "Relationship"
colnames(Adults)[11] <- "CapGain"
colnames(Adults)[12] <- "CapLoss"
colnames(Adults)[13] <- "WeekHours"

head(Adults, 4)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native Income
## 1    Own-child Black   Male       0       0        40 United-States  <=50K
## 2      Husband White   Male       0       0        50 United-States  <=50K
## 3      Husband White   Male       0       0        40 United-States   >50K
## 4      Husband Black   Male    7688       0        40 United-States   >50K
# Write data to working directory
write.csv(Adults, file = "Adults.csv") 

4.3 Explore Data Frame

# Show Characteristics of Data Frame
# Quick exploration: column names, structure, first rows, then a full
# descriptive-statistics table.
cat("\n\nColumns Available in Data Frame:\n")
## 
## 
## Columns Available in Data Frame:
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"
cat("\n\nShow Structure of the Data Frame:\n")
## 
## 
## Show Structure of the Data Frame:
str(Adults)
## 'data.frame':    48842 obs. of  15 variables:
##  $ Age         : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ Class       : Factor w/ 6 levels "?","Gov","Never-worked",..: 4 4 2 4 1 4 1 5 4 4 ...
##  $ Fnlwgt      : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ Educat      : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ EduNum      : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ Marital     : Factor w/ 5 levels "Divorced","Married",..: 3 2 2 2 3 3 3 2 3 2 ...
##  $ Occupat     : Factor w/ 15 levels "?","Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ Relationship: Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ Race        : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ Gender      : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ CapGain     : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ CapLoss     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ WeekHours   : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Native      : Factor w/ 42 levels "?","Cambodia",..: 40 40 40 40 40 40 40 40 40 40 ...
##  $ Income      : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
## 
## 
## First 5 Rows of Data Frame:
head(Adults, 5)
##   Age   Class Fnlwgt       Educat EduNum       Marital           Occupat
## 1  25 Private 226802         11th      7 Never-married Machine-op-inspct
## 2  38 Private  89814      HS-grad      9       Married   Farming-fishing
## 3  28     Gov 336951   Assoc-acdm     12       Married   Protective-serv
## 4  44 Private 160323 Some-college     10       Married Machine-op-inspct
## 5  18       ? 103497 Some-college     10 Never-married                 ?
##   Relationship  Race Gender CapGain CapLoss WeekHours        Native Income
## 1    Own-child Black   Male       0       0        40 United-States  <=50K
## 2      Husband White   Male       0       0        50 United-States  <=50K
## 3      Husband White   Male       0       0        40 United-States   >50K
## 4      Husband Black   Male    7688       0        40 United-States   >50K
## 5    Own-child White Female       0       0        30 United-States  <=50K
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# vtable::st (alias of sumtable; vtable was attached in section 2.5) with the
# same two summ vectors as before: numeric-column stats first, then
# count/percent for factor levels.
st(Adults, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##                           Variable     N    Mean     SD   Min    P25    P50
## 1                              Age 48842  38.644 13.711    17     28     37
## 2                            Class 48842                                   
## 3                            ... ?  2799  5.731%                           
## 4                          ... Gov  6549 13.409%                           
## 5                 ... Never-worked    10   0.02%                           
## 6                      ... Private 33906  69.42%                           
## 7                         ... Self  5557 11.378%                           
## 8                  ... Without-pay    21  0.043%                           
## 9                           Fnlwgt 48842  189664 105604 12285 117551 178145
## 10                          Educat 48842                                   
## 11                        ... 10th  1389  2.844%                           
## 12                        ... 11th  1812   3.71%                           
## 13                        ... 12th   657  1.345%                           
## 14                     ... 1st-4th   247  0.506%                           
## 15                     ... 5th-6th   509  1.042%                           
## 16                     ... 7th-8th   955  1.955%                           
## 17                         ... 9th   756  1.548%                           
## 18                  ... Assoc-acdm  1601  3.278%                           
## 19                   ... Assoc-voc  2061   4.22%                           
## 20                   ... Bachelors  8025 16.431%                           
## 21                   ... Doctorate   594  1.216%                           
## 22                     ... HS-grad 15784 32.316%                           
## 23                     ... Masters  2657   5.44%                           
## 24                   ... Preschool    83   0.17%                           
## 25                 ... Prof-school   834  1.708%                           
## 26                ... Some-college 10878 22.272%                           
## 27                          EduNum 48842  10.078  2.571     1      9     10
## 28                         Marital 48842                                   
## 29                    ... Divorced  6633 13.581%                           
## 30                     ... Married 23044 47.181%                           
## 31               ... Never-married 16117 32.998%                           
## 32                   ... Separated  1530  3.133%                           
## 33                     ... Widowed  1518  3.108%                           
## 34                         Occupat 48842                                   
## 35                           ... ?  2809  5.751%                           
## 36                ... Adm-clerical  5611 11.488%                           
## 37                ... Armed-Forces    15  0.031%                           
## 38                ... Craft-repair  6112 12.514%                           
## 39             ... Exec-managerial  6086 12.461%                           
## 40             ... Farming-fishing  1490  3.051%                           
## 41           ... Handlers-cleaners  2072  4.242%                           
## 42           ... Machine-op-inspct  3022  6.187%                           
## 43               ... Other-service  4923 10.079%                           
## 44             ... Priv-house-serv   242  0.495%                           
## 45              ... Prof-specialty  6172 12.637%                           
## 46             ... Protective-serv   983  2.013%                           
## 47                       ... Sales  5504 11.269%                           
## 48                ... Tech-support  1446  2.961%                           
## 49            ... Transport-moving  2355  4.822%                           
## 50                    Relationship 48842                                   
## 51                     ... Husband 19716 40.367%                           
## 52               ... Not-in-family 12583 25.763%                           
## 53              ... Other-relative  1506  3.083%                           
## 54                   ... Own-child  7581 15.521%                           
## 55                   ... Unmarried  5125 10.493%                           
## 56                        ... Wife  2331  4.773%                           
## 57                            Race 48842                                   
## 58          ... Amer-Indian-Eskimo   470  0.962%                           
## 59          ... Asian-Pac-Islander  1519   3.11%                           
## 60                       ... Black  4685  9.592%                           
## 61                       ... Other   406  0.831%                           
## 62                       ... White 41762 85.504%                           
## 63                          Gender 48842                                   
## 64                      ... Female 16192 33.152%                           
## 65                        ... Male 32650 66.848%                           
## 66                         CapGain 48842  1079.1   7452     0      0      0
## 67                         CapLoss 48842  87.502    403     0      0      0
## 68                       WeekHours 48842  40.422 12.391     1     40     40
## 69                          Native 48842                                   
## 70                           ... ?   857  1.755%                           
## 71                    ... Cambodia    28  0.057%                           
## 72                      ... Canada   182  0.373%                           
## 73                       ... China   122   0.25%                           
## 74                    ... Columbia    85  0.174%                           
## 75                        ... Cuba   138  0.283%                           
## 76          ... Dominican-Republic   103  0.211%                           
## 77                     ... Ecuador    45  0.092%                           
## 78                 ... El-Salvador   155  0.317%                           
## 79                     ... England   127   0.26%                           
## 80                      ... France    38  0.078%                           
## 81                     ... Germany   206  0.422%                           
## 82                      ... Greece    49    0.1%                           
## 83                   ... Guatemala    88   0.18%                           
## 84                       ... Haiti    75  0.154%                           
## 85          ... Holand-Netherlands     1  0.002%                           
## 86                    ... Honduras    20  0.041%                           
## 87                        ... Hong    30  0.061%                           
## 88                     ... Hungary    19  0.039%                           
## 89                       ... India   151  0.309%                           
## 90                        ... Iran    59  0.121%                           
## 91                     ... Ireland    37  0.076%                           
## 92                       ... Italy   105  0.215%                           
## 93                     ... Jamaica   106  0.217%                           
## 94                       ... Japan    92  0.188%                           
## 95                        ... Laos    23  0.047%                           
## 96                      ... Mexico   951  1.947%                           
## 97                   ... Nicaragua    49    0.1%                           
## 98  ... Outlying-US(Guam-USVI-etc)    23  0.047%                           
## 99                        ... Peru    46  0.094%                           
## 100                ... Philippines   295  0.604%                           
## 101                     ... Poland    87  0.178%                           
## 102                   ... Portugal    67  0.137%                           
## 103                ... Puerto-Rico   184  0.377%                           
## 104                   ... Scotland    21  0.043%                           
## 105                      ... South   115  0.235%                           
## 106                     ... Taiwan    65  0.133%                           
## 107                   ... Thailand    30  0.061%                           
## 108            ... Trinadad&Tobago    27  0.055%                           
## 109              ... United-States 43832 89.742%                           
## 110                    ... Vietnam    86  0.176%                           
## 111                 ... Yugoslavia    23  0.047%                           
## 112                         Income 48842                                   
## 113                      ... <=50K 37155 76.072%                           
## 114                       ... >50K 11687 23.928%                           
##        P75     Max NA Mode
## 1       48      90  0     
## 2                         
## 3                         
## 4                         
## 5                         
## 6                         
## 7                         
## 8                         
## 9   237642 1490400  0     
## 10                        
## 11                        
## 12                        
## 13                        
## 14                        
## 15                        
## 16                        
## 17                        
## 18                        
## 19                        
## 20                        
## 21                        
## 22                        
## 23                        
## 24                        
## 25                        
## 26                        
## 27      12      16  0     
## 28                        
## 29                        
## 30                        
## 31                        
## 32                        
## 33                        
## 34                        
## 35                        
## 36                        
## 37                        
## 38                        
## 39                        
## 40                        
## 41                        
## 42                        
## 43                        
## 44                        
## 45                        
## 46                        
## 47                        
## 48                        
## 49                        
## 50                        
## 51                        
## 52                        
## 53                        
## 54                        
## 55                        
## 56                        
## 57                        
## 58                        
## 59                        
## 60                        
## 61                        
## 62                        
## 63                        
## 64                        
## 65                        
## 66       0   99999  0     
## 67       0    4356  0     
## 68      45      99  0     
## 69                        
## 70                        
## 71                        
## 72                        
## 73                        
## 74                        
## 75                        
## 76                        
## 77                        
## 78                        
## 79                        
## 80                        
## 81                        
## 82                        
## 83                        
## 84                        
## 85                        
## 86                        
## 87                        
## 88                        
## 89                        
## 90                        
## 91                        
## 92                        
## 93                        
## 94                        
## 95                        
## 96                        
## 97                        
## 98                        
## 99                        
## 100                       
## 101                       
## 102                       
## 103                       
## 104                       
## 105                       
## 106                       
## 107                       
## 108                       
## 109                       
## 110                       
## 111                       
## 112                       
## 113                       
## 114

4.4 Standardise Numeric Variables

# Standardise Numeric Variables
# Each numeric predictor is converted to a z-score (mean 0, sd 1) and stored
# in a new ".z" column; the original columns are left untouched.
z_score <- function(v) (v - mean(v)) / sd(v)

Adults$Age.z       <- z_score(Adults$Age)
Adults$EduNum.z    <- z_score(Adults$EduNum)
Adults$CapGain.z   <- z_score(Adults$CapGain)
Adults$CapLoss.z   <- z_score(Adults$CapLoss)
Adults$WeekHours.z <- z_score(Adults$WeekHours)

# Confirm the five new ".z" columns were appended
names(Adults)
##  [1] "Age"          "Class"        "Fnlwgt"       "Educat"       "EduNum"      
##  [6] "Marital"      "Occupat"      "Relationship" "Race"         "Gender"      
## [11] "CapGain"      "CapLoss"      "WeekHours"    "Native"       "Income"      
## [16] "Age.z"        "EduNum.z"     "CapGain.z"    "CapLoss.z"    "WeekHours.z"

4.5 Use Predictors to Classify Whether or not a Person’s Income is less than $50K

# Calculate the Income Tree
# CART classification tree for Income, using the standardised numeric
# predictors plus the categorical demographics. minsplit = 2 / minbucket = 1
# relax the defaults so even very small nodes may be split.
IncomeTree <- rpart(
  Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z +
    Race + Gender + Class + Marital,
  data      = Adults,
  method    = "class",
  minsplit  = 2,
  minbucket = 1
)
print(IncomeTree)
## n= 48842 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 48842 11690 <=50K (0.76072 0.23928)  
##    2) Marital=Divorced,Never-married,Separated,Widowed 25798  1631 <=50K (0.93678 0.06322)  
##      4) CapGain.z< 0.802 25352  1200 <=50K (0.95267 0.04733) *
##      5) CapGain.z>=0.802 446    15 >50K (0.03363 0.96637) *
##    3) Marital=Married 23044 10060 <=50K (0.56362 0.43638)  
##      6) EduNum.z< 0.942 16234  5229 <=50K (0.67790 0.32210)  
##       12) CapGain.z< 0.539 15465  4475 <=50K (0.71064 0.28936)  
##         24) EduNum.z< -0.6138 2672   266 <=50K (0.90045 0.09955) *
##         25) EduNum.z>=-0.6138 12793  4209 <=50K (0.67099 0.32901)  
##           50) CapLoss.z< 4.3 12298  3833 <=50K (0.68832 0.31168) *
##           51) CapLoss.z>=4.3 495   119 >50K (0.24040 0.75960) *
##       13) CapGain.z>=0.539 769    15 >50K (0.01951 0.98049) *
##      7) EduNum.z>=0.942 6810  1983 >50K (0.29119 0.70881) *

4.6 Plot Decision Tree using Cart

# Plot the Decision Tree
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node Number\n")
## - Node Number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of Observations in the node\n")
## - Percentage of Observations in the node
rpart.plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?", 
           under = FALSE,  
           type = 2,                 # split labels below the node boxes
           extra = 106,              # class probability + % of observations per node
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,                # display node numbers
           fallen.leaves = TRUE,     # put all leaves on one row at the bottom
           digits = 3,
           box.palette = "RdGn"      # red-to-green diverging node colouring
           )

5 Classification and Regression Tree Using Conditional Inference Tree (CTREE, C4.5 Method)

5.1 Calculate Decision Tree Using Partykit and CTREE

# Install (when missing) and attach the packages used for conditional
# inference trees and their plotting.
for (pkg in c("partykit", "ggparty")) {
  if (!pkg %in% installed.packages()) { install.packages(pkg, dependencies = TRUE) }
  library(pkg, character.only = TRUE)
}

# Calculate IncomeTree
# The hyper-parameters are collected in an explicit ctree_control() object.
# The original calls forwarded them through "..." and ended with a stray
# trailing comma, which R silently matched as an empty positional argument
# (absorbed by the unused `subset` formal) — removed here.
tree_control <- ctree_control(
  minsplit     = 12,           # minimum node size required to attempt a split
  maxdepth     =  3,           # cap depth so the plotted tree stays readable
  teststat     = "quadratic",
  testtype     = "Bonferroni", # multiplicity-adjusted permutation tests
  minbucket    = 10,           # minimum observations in any terminal node
  mincriterion = 0.95          # 1 - alpha required to accept a split
)

IncomeTree <- ctree(Income ~ Age + EduNum + CapGain + CapLoss + WeekHours +
                      Race + Gender + Class + Marital,
                    data = Adults, weights = NULL, control = tree_control)

# Calculate IncomeTree from Normalised Values (same settings; only the
# numeric predictors are the z-scored versions)
IncomeTree.z <- ctree(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z +
                        Race + Gender + Class + Marital,
                      data = Adults, weights = NULL, control = tree_control)

print(IncomeTree)
## 
## Model formula:
## Income ~ Age + EduNum + CapGain + CapLoss + WeekHours + Race + 
##     Gender + Class + Marital
## 
## Fitted party:
## [1] root
## |   [2] Marital in Divorced, Never-married, Separated, Widowed
## |   |   [3] CapGain <= 6849
## |   |   |   [4] EduNum <= 13: <=50K (n = 23975, err = 4%)
## |   |   |   [5] EduNum > 13: <=50K (n = 1377, err = 25%)
## |   |   [6] CapGain > 6849
## |   |   |   [7] WeekHours <= 35: >50K (n = 46, err = 20%)
## |   |   |   [8] WeekHours > 35: >50K (n = 400, err = 2%)
## |   [9] Marital in Married
## |   |   [10] EduNum <= 12
## |   |   |   [11] EduNum <= 8: <=50K (n = 2726, err = 12%)
## |   |   |   [12] EduNum > 8: <=50K (n = 13508, err = 36%)
## |   |   [13] EduNum > 12
## |   |   |   [14] WeekHours <= 31: <=50K (n = 567, err = 44%)
## |   |   |   [15] WeekHours > 31: >50K (n = 6243, err = 27%)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8
print(IncomeTree.z)
## 
## Model formula:
## Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + 
##     Race + Gender + Class + Marital
## 
## Fitted party:
## [1] root
## |   [2] Marital in Divorced, Never-married, Separated, Widowed
## |   |   [3] CapGain.z <= 0.77
## |   |   |   [4] EduNum.z <= 1.14: <=50K (n = 23975, err = 4%)
## |   |   |   [5] EduNum.z > 1.14: <=50K (n = 1377, err = 25%)
## |   |   [6] CapGain.z > 0.77
## |   |   |   [7] WeekHours.z <= -0.44: >50K (n = 46, err = 20%)
## |   |   |   [8] WeekHours.z > -0.44: >50K (n = 400, err = 2%)
## |   [9] Marital in Married
## |   |   [10] EduNum.z <= 0.75
## |   |   |   [11] EduNum.z <= -0.81: <=50K (n = 2726, err = 12%)
## |   |   |   [12] EduNum.z > -0.81: <=50K (n = 13508, err = 36%)
## |   |   [13] EduNum.z > 0.75
## |   |   |   [14] WeekHours.z <= -0.76: <=50K (n = 567, err = 44%)
## |   |   |   [15] WeekHours.z > -0.76: >50K (n = 6243, err = 27%)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8

5.2 Plot Decision Tree Using Partykit

# Plot the Decision Tree
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)
indexcol  <- hcl(150, 200, 50, 0.4)

# plot() on a party object draws via grid graphics and returns NULL
# invisibly, so its result cannot be handed to ggsave(). Save the figure
# by opening a png device, drawing, and closing the device instead.
png(filename = "Is a Person Likely to Earn 50k.png",
    width = 12, height = 8, units = "in", res = 300)
plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(IncomeTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(IncomeTree, col = "black", 
                               # columncol has 3 elements, so index 4 was NA;
                               # two fills suffice for the two income classes
                               fill = columncol[c(2, 1)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))
dev.off()

6 Classification and Regression Tree Using the C5.0 Algorithm (C5.0 Method)

6.1 Calculate Decision Tree Using C5.0

# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)

# Make Variables
# Response as a factor and a predictor data frame, as required by the
# default (x/y) interface of C5.0. Columns are selected by position;
# judging from the fitted tree below they include Age, Marital, Educat,
# WeekHours and Class — TODO confirm against the Adults column order.
Outcome <- as.factor(Adults$Income)
Factors <- data.frame(Adults[, c(1, 2, 4, 6, 9, 10, 13, 14)])

# Calculate IncomeTree from Observed Values
# CF = 0.05 prunes aggressively; minCases = 50 forbids tiny leaves.
# NOTE(review): seed is drawn randomly on every run, so repeated knits
# can produce different trees — use a fixed seed for reproducibility.
IncomeTree <- C5.0(y = Outcome, x = Factors, rules = FALSE, control = C5.0Control(
  subset = FALSE,
  bands = 0,
  winnow = FALSE,
  noGlobalPruning = FALSE,
  CF = 0.05,
  minCases = 50,
  fuzzyThreshold = FALSE,
  seed = sample.int(4096, size = 1) - 1L,
  earlyStopping = TRUE,
  label = "outcome"))

# Calculate IncomeTree from Normalised Values
#IncomeTree.z <- C5.0(Income ~ Age.z + EduNum.z + CapGain.z + CapLoss.z + WeekHours.z + Race + Gender + Class + Marital, data = Adults, rules = FALSE)

# Show the fitted tree and its training-set evaluation
summary(IncomeTree)
## 
## Call:
## C5.0.default(x = Factors, y = Outcome, rules = FALSE, control
##  = FALSE, CF = 0.05, minCases = 50, fuzzyThreshold = FALSE, seed
##  = sample.int(4096, size = 1) - 1L, earlyStopping = TRUE, label = "outcome"))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:14 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 48842 cases (9 attributes) from undefined.data
## 
## Decision tree:
## 
## Age <= 27: <=50K (12012/369)
## Age > 27:
## :...Marital in {Divorced,Never-married,Separated,
##     :           Widowed}: <=50K (15612/1529)
##     Marital = Married:
##     :...Educat in {10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-voc,
##         :          HS-grad,Preschool}: <=50K (10153/2939)
##         Educat in {Doctorate,Masters,Prof-school}: >50K (2539/530)
##         Educat = Assoc-acdm:
##         :...Age <= 35: <=50K (180/68)
##         :   Age > 35: >50K (482/206)
##         Educat = Bachelors:
##         :...WeekHours <= 31: <=50K (323/135)
##         :   WeekHours > 31: >50K (3682/1094)
##         Educat = Some-college:
##         :...WeekHours <= 34: <=50K (353/92)
##             WeekHours > 34:
##             :...Class in {?,Never-worked}: >50K (0)
##                 Class in {Self,Without-pay}: <=50K (615.9/271)
##                 Class = Gov:
##                 :...Age <= 38: <=50K (185/71.6)
##                 :   Age > 38: >50K (364.7/157.7)
##                 Class = Private:
##                 :...WeekHours > 41:
##                     :...Age <= 37: <=50K (348.3/139)
##                     :   Age > 37: >50K (608/235.3)
##                     WeekHours <= 41:
##                     :...Age > 61: <=50K (65/25)
##                         Age <= 61:
##                         :...Age <= 47: <=50K (982.7/403)
##                             Age > 47: >50K (336.3/143.7)
## 
## 
## Evaluation on training data (48842 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      17 8408(17.2%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##   34787  2368    (a): class <=50K
##    6040  5647    (b): class >50K
## 
## 
##  Attribute usage:
## 
##  100.00% Age
##   75.41% Marital
##   43.44% Educat
##   16.10% WeekHours
##    7.04% Class
## 
## 
## Time: 0.1 secs
#summary(IncomeTree.z)

6.2 Plot Decision Tree Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors used by
# the partykit plots were never referenced in this chunk (dead code), and the
# stray trailing comma created an empty argument — both removed.
plot(IncomeTree, main = "Classification Tree: Is a Person Likely to Earn 50k or more?", cex.main = 1.5,
     gp = gpar(fontsize = 15),
     drop_terminal = FALSE, tnex = 1)

6.3 Get Data to Determine Whether Customer Should be Classified as Credit Risk

# Input Data Set: eight customers with savings/assets levels, income in
# thousands, and the observed credit-risk label.
savings_lvl <- c("Medium", "Low", "High", "Medium", "Low", "High", "Low", "Medium")
assets_lvl  <- c("High", "Low", "Medium", "Medium", "Medium", "High", "Low", "Medium")
income_k    <- c(75, 50, 25, 50, 100, 25, 25, 75)
risk_label  <- c("Good", "Bad", "Bad", "Good", "Good", "Good", "Bad", "Good")

RiskData <- data.frame(
  Customer = as.numeric(seq_along(risk_label)),
  Savings  = savings_lvl,
  Assets   = assets_lvl,
  IncomeK  = income_k,
  Risk     = risk_label
)

# Display Data Table
RiskData
##   Customer Savings Assets IncomeK Risk
## 1        1  Medium   High      75 Good
## 2        2     Low    Low      50  Bad
## 3        3    High Medium      25  Bad
## 4        4  Medium Medium      50 Good
## 5        5     Low Medium     100 Good
## 6        6    High   High      25 Good
## 7        7     Low    Low      25  Bad
## 8        8  Medium Medium      75 Good
# Explore Data
# NOTE(review): sumtable() is from the 'vtable' package, which is not loaded
# in this section — presumably attached earlier in the file; verify.
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
# The first summ/summ.names pair applies to numeric columns (full set of
# statistics); the second applies to factor/character columns (count and
# percentage per level).
sumtable(RiskData, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##      Variable N   Mean     SD Min  P25 P50  P75 Max NA Mode
## 1    Customer 8    4.5 2.4495   1 2.75 4.5 6.25   8  0     
## 2     Savings 8                                            
## 3    ... High 2    25%                                     
## 4     ... Low 3  37.5%                                     
## 5  ... Medium 3  37.5%                                     
## 6      Assets 8                                            
## 7    ... High 2    25%                                     
## 8     ... Low 2    25%                                     
## 9  ... Medium 4    50%                                     
## 10    IncomeK 8 53.125  28.15  25   25  50   75 100  0     
## 11       Risk 8                                            
## 12    ... Bad 3  37.5%                                     
## 13   ... Good 5  62.5%
# Write data to working directory
# (write.csv adds a row-name index column by default; pass row.names = FALSE
# to suppress it if a clean file is wanted)
write.csv(RiskData, file = "RiskData.csv")

6.4 Use Predictors to Determine Whether Customer Should be Classified as Credit Risk Using C5.0

# Make Variables
# NOTE(review): Factors is built here but never used below — the formula
# interface pulls Savings/Assets/IncomeK straight from RiskData, and only
# Outcome (the Risk labels as a factor) is referenced.
Outcome <- as.factor(RiskData$Risk)
Factors <- data.frame(RiskData[, c(2, 3, 4)])

# Calculate RiskTree from Observed Values (default C5.0 settings, tree output)
RiskTree <- C5.0(Outcome ~ Savings + Assets + IncomeK, data = RiskData, rules = FALSE)

# Show the fitted tree and its training-set evaluation
summary(RiskTree)
## 
## Call:
## C5.0.formula(formula = Outcome ~ Savings + Assets + IncomeK, data =
##  RiskData, rules = FALSE)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:15 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 8 cases (4 attributes) from undefined.data
## 
## Decision tree:
## 
## Assets in {High,Medium}: Good (6/1)
## Assets = Low: Bad (2)
## 
## 
## Evaluation on training data (8 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    1(12.5%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##       2     1    (a): class Bad
##             5    (b): class Good
## 
## 
##  Attribute usage:
## 
##  100.00% Assets
## 
## 
## Time: 0.0 secs

6.5 Plot Decision Tree to Determine Whether Customer Should be Classified as Credit Risk Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors were
# never referenced in this chunk (dead code), and the stray trailing comma
# created an empty argument — both removed.
plot(RiskTree, main = "Classification Tree: Is a Person Likely to be Credit Risk?", cex.main = 1.5,
     gp = gpar(fontsize = 11),
     drop_terminal = TRUE, tnex = 1)

7 Assignment

7.1 Load Libraries and Settings for the Assignment

# Install (when missing) and attach the CART packages used in the assignment
for (pkg in c("rpart", "rpart.plot")) {
  if (!pkg %in% installed.packages()) { install.packages(pkg, dependencies = TRUE) }
  library(pkg, character.only = TRUE)
}

# Set number of decimals
options(digits = 4)              # Modify global options

7.2 Get Data for Predicting Whether Going For Morning Walk

# Create the dataset: 16 mornings with weather, schedule and prior-exercise
# indicators plus the observed decision.
# The original chunk defined only 12 rows and no Day column, but the printed
# table and both fitted trees below report n = 16; the rows here are
# reconstructed exactly from that printed output (verified to reproduce the
# rpart node counts: 9 No / 7 Yes, all 6 rainy days are No, etc.).
Exercise <- data.frame(
  Day  = 1:16,
  Rain = c('No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No',
           'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No'),
  EarlyMeeting = c('No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No',
                   'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes'),
  ExerciseEveningBefore = c('No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
                            'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No'),
  GoForWalk = c('Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
                'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes')
)

# Show Data
Exercise
##    Day Rain EarlyMeeting ExerciseEveningBefore GoForWalk
## 1    1   No           No                    No       Yes
## 2    2  Yes           No                    No        No
## 3    3  Yes          Yes                   Yes        No
## 4    4   No          Yes                   Yes        No
## 5    5   No          Yes                   Yes        No
## 6    6   No          Yes                   Yes        No
## 7    7   No           No                    No       Yes
## 8    8   No           No                   Yes       Yes
## 9    9  Yes          Yes                    No        No
## 10  10  Yes           No                    No        No
## 11  11  Yes           No                   Yes        No
## 12  12  Yes          Yes                    No        No
## 13  13   No           No                   Yes       Yes
## 14  14   No           No                   Yes       Yes
## 15  15   No          Yes                    No       Yes
## 16  16   No          Yes                    No       Yes

7.3 Calculate Decision Tree

# Calculate the Tree
# Grow aggressively: allow splits down to a single observation and keep the
# complexity penalty very small so every informative split is made.
control <- rpart.control(minsplit = 1, minbucket = 1, cp = 0.001, maxdepth = 30)

mytree <- rpart(GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore,
                data    = Exercise,
                method  = "class",
                control = control)

# Show the fitted tree in text form
print(mytree)
## n= 16 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 16 7 No (0.5625 0.4375)  
##    2) Rain=Yes 6 0 No (1.0000 0.0000) *
##    3) Rain=No 10 3 Yes (0.3000 0.7000)  
##      6) EarlyMeeting=Yes 5 2 No (0.6000 0.4000)  
##       12) ExerciseEveningBefore=Yes 3 0 No (1.0000 0.0000) *
##       13) ExerciseEveningBefore=No 2 0 Yes (0.0000 1.0000) *
##      7) EarlyMeeting=No 5 0 Yes (0.0000 1.0000) *
# Plot the Tree
# The cat() calls below only print a legend describing what rpart.plot shows.
cat("\n\nThe following information is shown in each node:\n")
## 
## 
## The following information is shown in each node:
cat("- Node number\n")
## - Node number
cat("- Predicted class\n")
## - Predicted class
cat("- Predicted probability of this class\n")
## - Predicted probability of this class
cat("- Percentage of observations in the node\n")
## - Percentage of observations in the node
# type = 2: split labels below the node boxes.
# extra = 106: class probability plus percentage of observations per node.
rpart.plot(mytree, main = "Classification Tree To Decide Going for Morning Walk", 
           under = FALSE,  
           type = 2,
           extra = 106,
           clip.right.labs = TRUE, shadow.col = "gray", # shadows under the node boxes
           nn = TRUE,
           fallen.leaves = TRUE, 
           digits = 3,
           box.palette = "RdGn"
           )

7.4 Calculate Decision Tree Using PartyKit

# Load Library if not available
if(! "partykit" %in% installed.packages()) { install.packages("partykit", dependencies = TRUE) }
library(partykit)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "ggparty" %in% installed.packages()) { install.packages("ggparty", dependencies = TRUE) }
library(ggparty)

# Calculate ExerciseTree
# minsplit/minbucket are forwarded through "..." to ctree_control(); both
# are set to 1 so even tiny nodes may be split.
ExerciseTree <- ctree(GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore, data = Exercise, 
     minsplit = 1, 
     minbucket = 1)

# Print the fitted conditional inference tree
ExerciseTree
## 
## Model formula:
## GoForWalk ~ Rain + EarlyMeeting + ExerciseEveningBefore
## 
## Fitted party:
## [1] root
## |   [2] Rain in No: Yes (n = 10, err = 30%)
## |   [3] Rain in Yes: No (n = 6, err = 0%)
## 
## Number of inner nodes:    1
## Number of terminal nodes: 2

8 Plot Decision Tree Using Partykit

# Plot the Decision Tree
columncol <- hcl(c(270, 260, 250), 200, 30, 0.6)
labelcol  <- hcl(200, 200, 50, 0.2)   # NOTE(review): defined but not used below
indexcol  <- hcl(150, 200, 50, 0.4)   # NOTE(review): defined but not used below

# NOTE(review): columncol has 3 elements, so columncol[c(2,1,4)] yields NA
# as its third fill; with the two-level GoForWalk outcome only the first
# two fills should matter — verify the rendered colours.
plot(ExerciseTree, main="Classification Tree: Are we going For a Walk This Morning?", cex.main = 1.5, 
      gp = gpar(fontsize = 11),
      drop_terminal = TRUE, tnex = 1,
      inner_panel = node_inner(ExerciseTree, abbreviate = FALSE,
                               fill = "lightgrey",
                               gp = gpar(),
                               pval = TRUE, 
                               id = TRUE),
      terminal_panel = node_barplot(ExerciseTree, col = "black", 
                               fill = columncol[c(2,1,4)], 
                               beside = TRUE,
                               ymax = 1,
                               ylines = TRUE,
                               widths = 1,
                               gap = 0.1,
                               reverse = FALSE, 
                               id = TRUE))

8.1 Calculate Decision Tree Using C5.0

# Load Library if not available
if(! "C50" %in% installed.packages()) { install.packages("C50", dependencies = TRUE) }
library(C50)

# Make Variables
# Response as a factor and a predictor data frame for the default (x/y)
# interface of C5.0; the data.frame() call gives the columns the dotted
# names (Exercise.Rain, ...) seen in the printed trees below.
Outcome <- as.factor(Exercise$GoForWalk)
Factors <- data.frame(
  Exercise$Rain, Exercise$EarlyMeeting, Exercise$ExerciseEveningBefore)

# Calculate ExerciseTree from Observed Values
# trials = 5 requests boosting with five iterations; CF = 0.05 prunes
# aggressively and minCases = 1 allows single-case leaves.
# NOTE(review): seed is drawn randomly on every run, so repeated knits can
# give different boosted trees — use a fixed seed for reproducibility.
ExerciseTree <- C5.0(y = Outcome, x = Factors, rules = FALSE, trials = 5, control = C5.0Control(
  subset = FALSE,
  bands = 0,
  winnow = FALSE,
  noGlobalPruning = FALSE,
  CF = 0.05,
  minCases = 1,
  fuzzyThreshold = FALSE,
  seed = sample.int(4096, size = 1) - 1L,
  earlyStopping = FALSE,
  label = "outcome"))

# Show the tree from each boosting trial plus training-set performance
summary(ExerciseTree)
## 
## Call:
## C5.0.default(x = Factors, y = Outcome, trials = 5, rules = FALSE, control
##  = FALSE, CF = 0.05, minCases = 1, fuzzyThreshold = FALSE, seed
##  = sample.int(4096, size = 1) - 1L, earlyStopping = FALSE, label = "outcome"))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Jan 22 22:08:17 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 16 cases (4 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (6)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (5)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (2)
##         Exercise.ExerciseEveningBefore = Yes: No (3)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## Exercise.Rain = No: Yes (7.5/2.2)
## Exercise.Rain = Yes: No (4.5)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (4.8)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (4)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (1.6)
##         Exercise.ExerciseEveningBefore = Yes: No (5.5)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
##  No (12/4.2)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## Exercise.Rain = Yes: No (4.3)
## Exercise.Rain = No:
## :...Exercise.EarlyMeeting = No: Yes (4.9)
##     Exercise.EarlyMeeting = Yes:
##     :...Exercise.ExerciseEveningBefore = No: Yes (2)
##         Exercise.ExerciseEveningBefore = Yes: No (4.9)
## 
## 
## Evaluation on training data (16 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0      4    0( 0.0%)
##    1      2    3(18.8%)
##    2      4    0( 0.0%)
##    3      1    7(43.8%)
##    4      4    0( 0.0%)
## boost              0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##       9          (a): class No
##             7    (b): class Yes
## 
## 
##  Attribute usage:
## 
##  100.00% Exercise.Rain
##   62.50% Exercise.EarlyMeeting
##   31.25% Exercise.ExerciseEveningBefore
## 
## 
## Time: 0.0 secs

8.2 Plot Decision Tree Using C5.0

# Plot the Decision Tree
# The C5.0 plot method draws the tree directly. The palette vectors were
# never referenced in this chunk (dead code), and the stray trailing comma
# created an empty argument — both removed.
plot(ExerciseTree, main = "Classification Tree: Are we Going for a Walk this Morning?", cex.main = 1.5,
     gp = gpar(fontsize = 15),
     drop_terminal = FALSE, tnex = 1)

9 Classification Using Random Forest

9.1 Get Data and Library

# Load Library if not available
if(! "randomForest" %in% installed.packages()) { install.packages("randomForest", dependencies = TRUE) }
library(randomForest)

# Load the iris dataset (4 numeric measurements per flower plus the
# Species label used as the classification target)
data(iris)

# View the first few rows of the dataset
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

9.2 Split the Data

# Set seed for reproducibility
set.seed(123)

# Draw a 70% training sample; the rows not sampled form the 30% test set
n_obs       <- nrow(iris)
train_index <- sample(seq_len(n_obs), size = 0.7 * n_obs)
train_data  <- iris[train_index, ]
test_data   <- iris[-train_index, ]

9.3 Train Random Forest Model

# Train the random forest model
# importance = TRUE stores the variable-importance measures used in 9.5;
# ntree = 100 trees are grown on the training split.
rf_model <- randomForest(Species ~ ., data = train_data, importance = TRUE, ntree = 100)

# Print the model summary (includes the out-of-bag error estimate and
# the OOB confusion matrix)
print(rf_model)
## 
## Call:
##  randomForest(formula = Species ~ ., data = train_data, importance = TRUE,      ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.76%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         36          0         0     0.00000
## versicolor      0         29         3     0.09375
## virginica       0          2        35     0.05405

9.4 Evaluate Model on Testing Data

# Make predictions on the testing set
predictions <- predict(rf_model, test_data)

# Create a confusion matrix (rows = predicted class, columns = actual species)
confusion_matrix <- table(predictions, test_data$Species)

# Print the confusion matrix
print(confusion_matrix)
##             
## predictions  setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         17         0
##   virginica       0          1        13
# Calculate accuracy: correct predictions (the diagonal) over all test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 97.78 %"

9.5 Analyse Variable Importance

# View the importance of each variable
# (per-class columns plus MeanDecreaseAccuracy / MeanDecreaseGini, available
# because the model was fitted with importance = TRUE)
importance(rf_model)
##              setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
## Sepal.Length  2.485     4.1775     1.214                4.456            5.829
## Sepal.Width   1.725    -0.2708     3.803                3.246            1.586
## Petal.Length  9.533    13.3983    13.888               15.158           29.960
## Petal.Width  10.340    12.5554    13.980               15.020           31.708
# Plot variable importance (dot charts of the two MeanDecrease measures)
varImpPlot(rf_model)