1 Loading Libraries

# Loading necessary packages
## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

# Loading other packages if not available
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)

if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
#library(tidyr)

if(! "zoo" %in% installed.packages()) { install.packages("zoo", dependencies = TRUE) }
#library(zoo)

if(! "car" %in% installed.packages()) { install.packages("car", dependencies = TRUE) }
#library(car)

# Global Settings
options(digits =   4)
options(scipen = 999)
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")

2 Regression for Disease Data

2.1 Getting Data for Disease

# Download Training Data from URL
# Download Test Data from URL

2.2 Exploring Data for Disease

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Add ID to Data Frame
Patients$ID <- 1:nrow(Patients)

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")

## 
## 
## Columns Available in Data Frame:

names(Patients)

## [1] "PatientID" "Age"       "Disease"   "ID"

cat("\n\nShow Structure of the Data Frame:\n")

## 
## 
## Show Structure of the Data Frame:

str(Patients)

## 'data.frame':    20 obs. of  4 variables:
##  $ PatientID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age      : int  25 29 30 31 32 41 41 42 44 49 ...
##  $ Disease  : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...

cat("\n\nFirst 5 Rows of Data Frame:\n")

## 
## 
## First 5 Rows of Data Frame:

head(Patients, 5)

##   PatientID Age Disease ID
## 1         1  25       0  1
## 2         2  29       0  2
## 3         3  30       0  3
## 4         4  31       0  4
## 5         5  32       0  5

cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")

## 
## 
## Descriptive Statistics of Columns in Data Frame:

st(Patients, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)

##    Variable  N  Mean      SD Min   P25  P50   P75 Max NA Mode
## 1 PatientID 20  10.5  5.9161   1  5.75 10.5 15.25  20  0     
## 2       Age 20 52.95  19.508  25 38.75 49.5    69  84  0     
## 3   Disease 20  0.35 0.48936   0     0    0     1   1  0     
## 4        ID 20  10.5  5.9161   1  5.75 10.5 15.25  20  0

2.3 Performing Linear Regression for Disease

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Run regression
LinReg1    <- lm(Disease ~ Age,  data = Patients)

summary(LinReg1)

## 
## Call:
## lm(formula = Disease ~ Age, data = Patients)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6992 -0.2345 -0.0602  0.2943  0.7655 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.33357    0.28521   -1.17     0.26  
## Age          0.01291    0.00507    2.55     0.02 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.431 on 18 degrees of freedom
## Multiple R-squared:  0.265,  Adjusted R-squared:  0.224 
## F-statistic: 6.49 on 1 and 18 DF,  p-value: 0.0202

anova(LinReg1)

## Analysis of Variance Table
## 
## Response: Disease
##           Df Sum Sq Mean Sq F value Pr(>F)  
## Age        1   1.21   1.205    6.49   0.02 *
## Residuals 18   3.34   0.186                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Calculate Predictions
Patients$PredictLm <- predict(LinReg1)

# Calculate RMSE
RMSELinReg1 <- sqrt(mean((Patients$Disease - Patients$PredictLm)^2))

cat("\n\nRMSE of Linear Regression: ", sprintf(RMSELinReg1, fmt = '%#.4f'), "\n")

## 
## 
## RMSE of Linear Regression:  0.4090

2.4 Performing Logistic Regression for Disease

# Run Regression
LogReg1 <- glm(Disease ~ Age, data = Patients, family = binomial(link = 'logit'))

summary(LogReg1)

## 
## Call:
## glm(formula = Disease ~ Age, family = binomial(link = "logit"), 
##     data = Patients)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -4.3721     1.9656   -2.22    0.026 *
## Age           0.0670     0.0322    2.08    0.038 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25.898  on 19  degrees of freedom
## Residual deviance: 20.201  on 18  degrees of freedom
## AIC: 24.2
## 
## Number of Fisher Scoring iterations: 4

anova(LogReg1)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Disease
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                    19       25.9           
## Age   1      5.7        18       20.2    0.017 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)

# Calculate McFadden’s R2
pR2(LogReg1)

## fitting null model for pseudo-r2

##      llh  llhNull       G2 McFadden     r2ML     r2CU 
## -10.1007 -12.9489   5.6964   0.2200   0.2479   0.3414

# Calculate Predictions
Patients$PredictLogit <- predict(LogReg1)

# Calculate RMSE
RMSELogReg1 <- sqrt(mean((Patients$Disease - Patients$PredictLogit)^2))

cat("\n\nRMSE of Logistic Regression: ", sprintf(RMSELogReg1, fmt = '%#.4f'), "\n")

## 
## 
## RMSE of Logistic Regression:  1.6150

2.5 Comparing Models for Disease

# Produce AIC table
AIClin <- AIC(LinReg1)
AIClog <- AIC(LogReg1)
cat("\n\nAIC of Linear Regression: ", sprintf(AIClin, fmt = '%#.4f'), "\n")

## 
## 
## AIC of Linear Regression:  26.9915

cat("AIC of Logistic Regression: ", sprintf(AIClog, fmt = '%#.4f'), "\n")

## AIC of Logistic Regression:  24.2015

# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "broom" %in% installed.packages()) { install.packages("broom", dependencies = TRUE) }
library(broom)


glance(LinReg1) %>%
  dplyr::select(adj.r.squared, AIC, BIC, p.value)

## # A tibble: 1 × 4
##   adj.r.squared   AIC   BIC p.value
##           <dbl> <dbl> <dbl>   <dbl>
## 1         0.224  27.0  30.0  0.0202

glance(LogReg1) %>%
  dplyr::select(AIC, BIC)

## # A tibble: 1 × 2
##     AIC   BIC
##   <dbl> <dbl>
## 1  24.2  26.2

2.6 Showing Scatter Plot for Disease

# Show Scatter Plot
plot(Patients$Age, Patients$Disease,
xlab = "Age", ylab = "Disease", cex.lab = 1.2,
main = "Disease vs. Age", cex.main = 1.6, 
xlim = c(20, 90), pch = 16)
abline(LinReg1, lty = 3, col = "red")

# Add Courve to Plot
curve(predict(LogReg1, data.frame(Age = x),
              type = "resp"), add = TRUE, lwd = 2, col = "blue")
legend("topleft", legend = c("Lin", "Log"), col = c("red", "blue"), lty = c(3, 1), cex = 1.2)

2.7 Showing Plot with CI for Disease

# Load Library if not available
if(! "sjPlot" %in% installed.packages()) { install.packages("sjPlot", dependencies = TRUE) }
library(sjPlot)

# Plot Probability with CI
plot_model(LogReg1, type = "pred", axis.lim = c(0, 1))

2.8 Showing McFadden’s Rsquare for Logistic Regression of Disease

# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)

pR2(LogReg1)

## fitting null model for pseudo-r2

##      llh  llhNull       G2 McFadden     r2ML     r2CU 
## -10.1007 -12.9489   5.6964   0.2200   0.2479   0.3414

2.9 Calculating Odds Ratio and 95% CI and Predict for Disease

# Calculate Odds Ratio and 95% CI
exp(cbind(OR = coef(LogReg1), confint(LogReg1)))

##                  OR     2.5 % 97.5 %
## (Intercept) 0.01262 0.0001111 0.3458
## Age         1.06925 1.0110685 1.1534

# Predict Chance to Have Disease For Test Data
PatientsTest$Prob <- predict(LogReg1, newdata = PatientsTest, type = "response")

cat("\n\nThe Probability of Catching Disease for New Data is Shown in Prob:\n")

## 
## 
## The Probability of Catching Disease for New Data is Shown in Prob:

PatientsTest

##    PatientID Age Disease    Prob
## 1          1  26       0 0.06715
## 2          2  31       0 0.09142
## 3          3  31       1 0.09142
## 4          4  32       0 0.09713
## 5          5  35       0 0.11623
## 6          6  41       0 0.16426
## 7          7  41       1 0.16426
## 8          8  42       0 0.17365
## 9          9  82       1 0.75367
## 10        10  83       1 0.76589

Disease50 <- exp(LogReg1$coefficients[1] + LogReg1$coefficients[2] * 50) / (1 + exp(LogReg1$coefficients[1] + LogReg1$coefficients[2] * 50))
cat("\n\nThe Probability of a 50-Year-old Patient Catching the Disease is:", Disease50 * 100, "%.\n")

## 
## 
## The Probability of a 50-Year-old Patient Catching the Disease is: 26.42 %.

2.10 Calculating Deviance

# Run Regression for Null Model - Only Intercept
LogReg1Null <- glm(Disease ~ 1, data = Patients, family = binomial(link = 'logit'))

summary(LogReg1Null)

## 
## Call:
## glm(formula = Disease ~ 1, family = binomial(link = "logit"), 
##     data = Patients)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -0.619      0.469   -1.32     0.19
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25.898  on 19  degrees of freedom
## Residual deviance: 25.898  on 19  degrees of freedom
## AIC: 27.9
## 
## Number of Fisher Scoring iterations: 4

anova(LogReg1Null)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Disease
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                    19       25.9

# Compute Rsq 
r2log <- 1 - LogReg1$deviance / LogReg1$null.deviance

cat("\n\nThe R2 of this Logistic Regression Model is:", r2log * 100, "%.\n")

## 
## 
## The R2 of this Logistic Regression Model is: 22 %.

3 Logistic Regression with TelcoChurn

3.1 Getting Data for TelcoChurn (VMchurn)

# Download Training Data from URL

3.2 Exploring Data for TelcoChurn

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Add ID to Data Frame
VMchurn$ID <- 1:nrow(VMchurn)

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")

## 
## 
## Columns Available in Data Frame:

names(VMchurn)

##  [1] "CustomerID"           "Gender"               "SeniorCitizen"       
##  [4] "MaritalStatus"        "Dependents"           "Tenure"              
##  [7] "PhoneService"         "MultipleLines"        "InternetService"     
## [10] "OnlineSecurity"       "OnlineBackup"         "DeviceProtection"    
## [13] "TechSupport"          "StreamingTV"          "StreamingMovies"     
## [16] "Contract"             "PaperlessBilling"     "PaymentMethod"       
## [19] "InternationalPlan"    "VoiceMailPlan"        "NumbervMailMessages" 
## [22] "TotalDayMinutes"      "TotalDayCalls"        "TotalEveMinutes"     
## [25] "TotalEveCalls"        "TotalNightMinutes"    "TotalNightCalls"     
## [28] "TotalIntlMinutes"     "TotalIntlCalls"       "CustomerServiceCalls"
## [31] "TotalCall"            "TotalRevenue"         "Churn"               
## [34] "ID"

cat("\n\nShow Structure of the Data Frame:\n")

## 
## 
## Show Structure of the Data Frame:

str(VMchurn)

## 'data.frame':    3333 obs. of  34 variables:
##  $ CustomerID          : Factor w/ 3333 levels "0002-ORFBO","0004-TLHLJ",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender              : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 2 1 1 ...
##  $ SeniorCitizen       : int  0 0 0 1 1 0 1 1 0 0 ...
##  $ MaritalStatus       : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 1 1 2 ...
##  $ Dependents          : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 2 ...
##  $ Tenure              : int  9 4 9 71 7 5 1 45 3 4 ...
##  $ PhoneService        : Factor w/ 1 level "Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ MultipleLines       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ InternetService     : Factor w/ 3 levels "DSL","Fiber optic",..: 1 2 1 2 1 2 2 1 3 3 ...
##  $ OnlineSecurity      : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 3 1 1 3 2 2 ...
##  $ OnlineBackup        : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 1 1 2 2 ...
##  $ DeviceProtection    : Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 1 1 3 2 2 ...
##  $ TechSupport         : Factor w/ 3 levels "No","No internet service",..: 3 1 3 3 1 1 1 1 2 2 ...
##  $ StreamingTV         : Factor w/ 3 levels "No","No internet service",..: 3 1 3 3 1 1 1 1 2 2 ...
##  $ StreamingMovies     : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 1 1 1 3 2 2 ...
##  $ Contract            : Factor w/ 3 levels "Month-to-month",..: 2 1 1 3 1 1 1 2 1 1 ...
##  $ PaperlessBilling    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 1 1 ...
##  $ PaymentMethod       : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 3 2 1 3 3 3 2 4 4 ...
##  $ InternationalPlan   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 1 ...
##  $ VoiceMailPlan       : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ NumbervMailMessages : int  0 0 36 0 0 0 0 0 0 0 ...
##  $ TotalDayMinutes     : num  168.8 122.2 178.7 190.2 67.7 ...
##  $ TotalDayCalls       : int  137 112 134 68 68 95 55 133 158 99 ...
##  $ TotalEveMinutes     : num  241 132 179 262 196 ...
##  $ TotalEveCalls       : int  107 94 102 64 86 128 124 86 120 93 ...
##  $ TotalNightMinutes   : num  205 170 127 130 236 ...
##  $ TotalNightCalls     : int  106 106 82 92 137 105 81 80 46 106 ...
##  $ TotalIntlMinutes    : num  15.5 10.3 8 8.8 12 12.9 10 11.5 12.4 8 ...
##  $ TotalIntlCalls      : int  4 9 4 4 2 5 7 3 3 4 ...
##  $ CustomerServiceCalls: int  0 5 2 0 1 3 3 0 1 1 ...
##  $ TotalCall           : int  354 326 324 228 294 336 270 302 328 303 ...
##  $ TotalRevenue        : num  593 281 572 7904 340 ...
##  $ Churn               : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 1 2 ...
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...

cat("\n\nFirst 5 Rows of Data Frame:\n")

## 
## 
## First 5 Rows of Data Frame:

head(VMchurn, 5)

##   CustomerID Gender SeniorCitizen MaritalStatus Dependents Tenure PhoneService
## 1 0002-ORFBO Female             0           Yes        Yes      9          Yes
## 2 0004-TLHLJ   Male             0            No         No      4          Yes
## 3 0013-MHZWF Female             0            No        Yes      9          Yes
## 4 0013-SMEOE Female             1           Yes         No     71          Yes
## 5 0015-UOCOJ Female             1            No         No      7          Yes
##   MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1            No             DSL             No          Yes               No
## 2            No     Fiber optic             No           No              Yes
## 3            No             DSL             No           No               No
## 4            No     Fiber optic            Yes          Yes              Yes
## 5            No             DSL            Yes           No               No
##   TechSupport StreamingTV StreamingMovies       Contract PaperlessBilling
## 1         Yes         Yes              No       One year              Yes
## 2          No          No              No Month-to-month              Yes
## 3         Yes         Yes             Yes Month-to-month              Yes
## 4         Yes         Yes             Yes       Two year              Yes
## 5          No          No              No Month-to-month              Yes
##               PaymentMethod InternationalPlan VoiceMailPlan NumbervMailMessages
## 1              Mailed check                No            No                   0
## 2          Electronic check               Yes            No                   0
## 3   Credit card (automatic)                No           Yes                  36
## 4 Bank transfer (automatic)                No            No                   0
## 5          Electronic check                No            No                   0
##   TotalDayMinutes TotalDayCalls TotalEveMinutes TotalEveCalls TotalNightMinutes
## 1           168.8           137           241.4           107             204.8
## 2           122.2           112           131.7            94             169.5
## 3           178.7           134           178.6           102             126.8
## 4           190.2            68           262.2            64             130.0
## 5            67.7            68           195.7            86             236.5
##   TotalNightCalls TotalIntlMinutes TotalIntlCalls CustomerServiceCalls
## 1             106             15.5              4                    0
## 2             106             10.3              9                    5
## 3              82              8.0              4                    2
## 4              92              8.8              4                    0
## 5             137             12.0              2                    1
##   TotalCall TotalRevenue Churn ID
## 1       354        593.3    No  1
## 2       326        280.9   Yes  2
## 3       324        571.5    No  3
## 4       228       7904.3    No  4
## 5       294        340.4    No  5

cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")

## 
## 
## Descriptive Statistics of Columns in Data Frame:

st(VMchurn[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)

##                         Variable    N   Mean     SD  Min   P25   P50   P75
## 1                         Gender 3333                                     
## 2                     ... Female 1621 48.63%                              
## 3                       ... Male 1712 51.37%                              
## 4                  SeniorCitizen 3333 0.1101 0.3131    0     0     0     0
## 5                  MaritalStatus 3333                                     
## 6                         ... No 1831 54.94%                              
## 7                        ... Yes 1502 45.06%                              
## 8                     Dependents 3333                                     
## 9                         ... No 2237 67.12%                              
## 10                       ... Yes 1096 32.88%                              
## 11                        Tenure 3333     28  23.26    0     7    23    48
## 12                  PhoneService 3333                                     
## 13                       ... Yes 3333   100%                              
## 14                 MultipleLines 3333                                     
## 15                        ... No 3024 90.73%                              
## 16                       ... Yes  309  9.27%                              
## 17               InternetService 3333                                     
## 18                       ... DSL 1036 31.08%                              
## 19               ... Fiber optic 1118 33.54%                              
## 20                        ... No 1179 35.37%                              
## 21                OnlineSecurity 3333                                     
## 22                        ... No 1356 40.68%                              
## 23       ... No internet service 1179 35.37%                              
## 24                       ... Yes  798 23.94%                              
## 25                  OnlineBackup 3333                                     
## 26                        ... No 1289 38.67%                              
## 27       ... No internet service 1179 35.37%                              
## 28                       ... Yes  865 25.95%                              
## 29              DeviceProtection 3333                                     
## 30                        ... No 1320  39.6%                              
## 31       ... No internet service 1179 35.37%                              
## 32                       ... Yes  834 25.02%                              
## 33                   TechSupport 3333                                     
## 34                        ... No 1353 40.59%                              
## 35       ... No internet service 1179 35.37%                              
## 36                       ... Yes  801 24.03%                              
## 37                   StreamingTV 3333                                     
## 38                        ... No 1266 37.98%                              
## 39       ... No internet service 1179 35.37%                              
## 40                       ... Yes  888 26.64%                              
## 41               StreamingMovies 3333                                     
## 42                        ... No 1242 37.26%                              
## 43       ... No internet service 1179 35.37%                              
## 44                       ... Yes  912 27.36%                              
## 45                      Contract 3333                                     
## 46            ... Month-to-month 1790 53.71%                              
## 47                  ... One year  762 22.86%                              
## 48                  ... Two year  781 23.43%                              
## 49              PaperlessBilling 3333                                     
## 50                        ... No 1638 49.14%                              
## 51                       ... Yes 1695 50.86%                              
## 52                 PaymentMethod 3333                                     
## 53 ... Bank transfer (automatic)  694 20.82%                              
## 54   ... Credit card (automatic)  704 21.12%                              
## 55          ... Electronic check  887 26.61%                              
## 56              ... Mailed check 1048 31.44%                              
## 57             InternationalPlan 3333                                     
## 58                        ... No 3010 90.31%                              
## 59                       ... Yes  323  9.69%                              
## 60                 VoiceMailPlan 3333                                     
## 61                        ... No 2411 72.34%                              
## 62                       ... Yes  922 27.66%                              
## 63           NumbervMailMessages 3333  8.099  13.69    0     0     0    20
## 64               TotalDayMinutes 3333  179.8  54.47    0 143.7 179.4 216.4
## 65                 TotalDayCalls 3333  100.4  20.07    0    87   101   114
## 66               TotalEveMinutes 3333    201  50.71    0 166.6 201.4 235.3
## 67                 TotalEveCalls 3333  100.1  19.92    0    87   100   114
## 68             TotalNightMinutes 3333  200.9  50.57 23.2   167 201.2 235.3
## 69               TotalNightCalls 3333  100.1  19.57   33    87   100   113
## 70              TotalIntlMinutes 3333  10.24  2.792    0   8.5  10.3  12.1
## 71                TotalIntlCalls 3333  4.479  2.461    0     3     4     6
## 72          CustomerServiceCalls 3333  1.563  1.315    0     1     1     2
## 73                     TotalCall 3333  306.7  34.45  194   284   307   330
## 74                  TotalRevenue 3328   1673   1920 18.8 252.6 892.5  2434
## 75                         Churn 3333                                     
## 76                        ... No 2850 85.51%                              
## 77                       ... Yes  483 14.49%                              
## 78                            ID 3333   1667  962.3    1   834  1667  2500
##      Max                 NA Mode
## 1                               
## 2                               
## 3                               
## 4      1                  0     
## 5                               
## 6                               
## 7                               
## 8                               
## 9                               
## 10                              
## 11    72                  0     
## 12                              
## 13                              
## 14                              
## 15                              
## 16                              
## 17                              
## 18                              
## 19                              
## 20                              
## 21                              
## 22                              
## 23                              
## 24                              
## 25                              
## 26                              
## 27                              
## 28                              
## 29                              
## 30                              
## 31                              
## 32                              
## 33                              
## 34                              
## 35                              
## 36                              
## 37                              
## 38                              
## 39                              
## 40                              
## 41                              
## 42                              
## 43                              
## 44                              
## 45                              
## 46                              
## 47                              
## 48                              
## 49                              
## 50                              
## 51                              
## 52                              
## 53                              
## 54                              
## 55                              
## 56                              
## 57                              
## 58                              
## 59                              
## 60                              
## 61                              
## 62                              
## 63    51                  0     
## 64 350.8                  0     
## 65   165                  0     
## 66 363.7                  0     
## 67   170                  0     
## 68   395                  0     
## 69   175                  0     
## 70    20                  0     
## 71    20                  0     
## 72     9                  0     
## 73   418                  0     
## 74  8476 0.0015001500150015     
## 75                              
## 76                              
## 77                              
## 78  3333                  0

3.3 Splitting Data

# Loading other packages if not available
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)

# Convert X and Y in Numeric Columns
VMchurn$ChurnNum  <- ifelse(VMchurn$Churn == 'No', 0, 1)
VMchurn$VMplanNum <- ifelse(VMchurn$VoiceMailPlan == 'No', 0, 1)
VMchurn$CSChig    <- ifelse(VMchurn$CustomerServiceCalls > 3, 1, 0)
VMchurn$CSCmed    <- ifelse(VMchurn$CustomerServiceCalls > 1 & VMchurn$CustomerServiceCalls < 4, 1, 0)

# Split data into train and test
set.seed(1)
split <- initial_split(VMchurn, prop = 0.80)
VMtrain <- training(split)
VMtest  <- testing(split)

# Show Split Data
cat("\n\nShow Structure of the Training Data Frame:\n")

## 
## 
## Show Structure of the Training Data Frame:

cat("Training Data Frame includes ", nrow(VMtrain), " rows and ", ncol(VMtrain), " columns.")

## Training Data Frame includes  2666  rows and  38  columns.

cat("\n\nShow Structure of the Testing Data Frame:\n")

## 
## 
## Show Structure of the Testing Data Frame:

cat("Testing Data Frame includes ", nrow(VMtest), " rows and ", ncol(VMtest), " columns.")

## Testing Data Frame includes  667  rows and  38  columns.

3.4 Performing Regression

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

LinReg2 <- lm(ChurnNum ~ VMplanNum, data = VMtrain)
LogReg2 <- glm(ChurnNum ~ VMplanNum, data = VMtrain, family = "binomial")

summary(LogReg2)

## 
## Call:
## glm(formula = ChurnNum ~ VMplanNum, family = "binomial", data = VMtrain)
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)   -1.593      0.061  -26.10 < 0.0000000000000002 ***
## VMplanNum     -0.768      0.143   -5.35          0.000000086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2212.2  on 2665  degrees of freedom
## Residual deviance: 2179.7  on 2664  degrees of freedom
## AIC: 2184
## 
## Number of Fisher Scoring iterations: 5

anova(LogReg2)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: ChurnNum
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev    Pr(>Chi)    
## NULL                       2665       2212                
## VMplanNum  1     32.5      2664       2180 0.000000012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.5 Performing Regression with Multiple Xs

# Run Regression with all Predictors
LogReg2a <- glm(ChurnNum ~ VMplanNum + Tenure + TotalDayMinutes + TotalEveMinutes + TotalNightMinutes + TotalIntlMinutes + CSCmed + CSChig, data = VMtrain, family = "binomial")

summary(LogReg2a, digits = 5)

## 
## Call:
## glm(formula = ChurnNum ~ VMplanNum + Tenure + TotalDayMinutes + 
##     TotalEveMinutes + TotalNightMinutes + TotalIntlMinutes + 
##     CSCmed + CSChig, family = "binomial", data = VMtrain)
## 
## Coefficients:
##                   Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)       -6.08378    0.57527  -10.58 < 0.0000000000000002 ***
## VMplanNum         -0.91883    0.16624   -5.53          0.000000033 ***
## Tenure            -0.05761    0.00451  -12.76 < 0.0000000000000002 ***
## TotalDayMinutes    0.01413    0.00129   10.95 < 0.0000000000000002 ***
## TotalEveMinutes    0.00605    0.00132    4.59          0.000004340 ***
## TotalNightMinutes  0.00181    0.00129    1.40              0.16199    
## TotalIntlMinutes   0.08746    0.02407    3.63              0.00028 ***
## CSCmed             0.01496    0.14591    0.10              0.91832    
## CSChig             2.46057    0.19653   12.52 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2212.2  on 2665  degrees of freedom
## Residual deviance: 1552.9  on 2657  degrees of freedom
## AIC: 1571
## 
## Number of Fisher Scoring iterations: 6

anova(LogReg2a)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: ChurnNum
## 
## Terms added sequentially (first to last)
## 
## 
##                   Df Deviance Resid. Df Resid. Dev             Pr(>Chi)    
## NULL                               2665       2212                         
## VMplanNum          1       32      2664       2180          0.000000012 ***
## Tenure             1      316      2663       1863 < 0.0000000000000002 ***
## TotalDayMinutes    1      109      2662       1754 < 0.0000000000000002 ***
## TotalEveMinutes    1       14      2661       1741              0.00020 ***
## TotalNightMinutes  1        1      2660       1740              0.32080    
## TotalIntlMinutes   1       12      2659       1728              0.00069 ***
## CSCmed             1       12      2658       1716              0.00044 ***
## CSChig             1      163      2657       1553 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.6 Performing Regression with Multiple Xs

# Run Regression with all Predictors
LogReg3 <- glm(ChurnNum ~ VMplanNum + Tenure + TotalDayMinutes + TotalEveMinutes + TotalNightMinutes + TotalIntlMinutes + CSChig, data = VMtrain, family = "binomial")

summary(LogReg3, digits = 5)

## 
## Call:
## glm(formula = ChurnNum ~ VMplanNum + Tenure + TotalDayMinutes + 
##     TotalEveMinutes + TotalNightMinutes + TotalIntlMinutes + 
##     CSChig, family = "binomial", data = VMtrain)
## 
## Coefficients:
##                   Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)       -6.07321    0.56580  -10.73 < 0.0000000000000002 ***
## VMplanNum         -0.91925    0.16619   -5.53          0.000000032 ***
## Tenure            -0.05762    0.00451  -12.76 < 0.0000000000000002 ***
## TotalDayMinutes    0.01412    0.00129   10.96 < 0.0000000000000002 ***
## TotalEveMinutes    0.00605    0.00132    4.60          0.000004285 ***
## TotalNightMinutes  0.00180    0.00129    1.40              0.16294    
## TotalIntlMinutes   0.08742    0.02406    3.63              0.00028 ***
## CSChig             2.45448    0.18726   13.11 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2212.2  on 2665  degrees of freedom
## Residual deviance: 1552.9  on 2658  degrees of freedom
## AIC: 1569
## 
## Number of Fisher Scoring iterations: 6

anova(LogReg3)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: ChurnNum
## 
## Terms added sequentially (first to last)
## 
## 
##                   Df Deviance Resid. Df Resid. Dev             Pr(>Chi)    
## NULL                               2665       2212                         
## VMplanNum          1       32      2664       2180          0.000000012 ***
## Tenure             1      316      2663       1863 < 0.0000000000000002 ***
## TotalDayMinutes    1      109      2662       1754 < 0.0000000000000002 ***
## TotalEveMinutes    1       14      2661       1741              0.00020 ***
## TotalNightMinutes  1        1      2660       1740              0.32080    
## TotalIntlMinutes   1       12      2659       1728              0.00069 ***
## CSChig             1      175      2658       1553 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.7 Showing McFadden’s Rsquare

# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)

pR2(LogReg3)

## fitting null model for pseudo-r2

##        llh    llhNull         G2   McFadden       r2ML       r2CU 
##  -776.4689 -1106.0906   659.2435     0.2980     0.2191     0.3885

4 Logistic Regression with Space Shuttle Data

4.1 Getting Data

# Download Training Data from URL

4.2 Exploring Data for Shuttle

# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

# Add ID to Data Frame
Shuttle$ID <- 1:nrow(Shuttle)

# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")

## 
## 
## Columns Available in Data Frame:

names(Shuttle)

## [1] "Flights"       "Date"          "nfails.field"  "nfails.nozzle"
## [5] "fail.field"    "fail.nozzle"   "TempC"         "TotalFails"   
## [9] "ID"

cat("\n\nShow Structure of the Data Frame:\n")

## 
## 
## Show Structure of the Data Frame:

str(Shuttle)

## 'data.frame':    23 obs. of  9 variables:
##  $ Flights      : Factor w/ 23 levels "1","2","3","41-B",..: 1 2 3 8 17 21 22 23 4 5 ...
##  $ Date         : Factor w/ 23 levels "03 Feb 1984",..: 8 11 14 7 3 13 21 18 1 5 ...
##  $ nfails.field : int  0 1 0 0 0 0 0 0 1 1 ...
##  $ nfails.nozzle: int  0 0 0 0 2 0 0 0 1 1 ...
##  $ fail.field   : int  0 1 0 0 0 0 0 0 1 1 ...
##  $ fail.nozzle  : int  0 0 0 0 1 0 0 0 1 1 ...
##  $ TempC        : num  18.9 21.1 20.6 20 19.4 22.2 22.8 21.1 13.9 17.2 ...
##  $ TotalFails   : int  0 1 0 0 3 0 0 0 3 3 ...
##  $ ID           : int  1 2 3 4 5 6 7 8 9 10 ...

cat("\n\nFirst 5 Rows of Data Frame:\n")

## 
## 
## First 5 Rows of Data Frame:

head(Shuttle, 5)

##   Flights        Date nfails.field nfails.nozzle fail.field fail.nozzle TempC
## 1       1 12 Apr 1981            0             0          0           0  18.9
## 2       2 12 Nov 1981            1             0          1           0  21.1
## 3       3 22 Mar 1982            0             0          0           0  20.6
## 4       5 11 Nov 1982            0             0          0           0  20.0
## 5       6 04 Apr 1983            0             2          0           1  19.4
##   TotalFails ID
## 1          0  1
## 2          1  2
## 3          0  3
## 4          0  4
## 5          3  5

cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")

## 
## 
## Descriptive Statistics of Columns in Data Frame:

st(Shuttle[, -c(1, 2)], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)

##        Variable  N    Mean      SD  Min  P25  P50  P75  Max NA Mode
## 1  nfails.field 23  0.3913 0.65638    0    0    0    1    2  0     
## 2 nfails.nozzle 23 0.73913 0.91539    0    0    0    2    2  0     
## 3    fail.field 23 0.30435 0.47047    0    0    0    1    1  0     
## 4   fail.nozzle 23 0.43478 0.50687    0    0    0    1    1  0     
## 5         TempC 23  20.861  3.9195 11.7 19.4 21.1 23.9 27.2  0     
## 6    TotalFails 23  1.5652   1.674    0    0    1    3    5  0     
## 7            ID 23      12  6.7823    1  6.5   12 17.5   23  0

4.3 Calculating Confidence Interval for Pearson’s Correlation Coefficient

# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)

# Calculate Correlation Coefficient
ci_cor(
  x = Shuttle$TempC,
  y = Shuttle$TotalFails,
  probs = c(0.025, 0.975),
  method = "pearson"
)

## 
##  Two-sided 95% normal confidence interval for the true Pearson
##  correlation coefficient
## 
## Sample estimate: -0.5874 
## Confidence interval:
##    2.5%   97.5% 
## -0.8048 -0.2312

4.4 Performing Linear Regression

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Run regression
LinReg3    <- lm(TotalFails ~ TempC,  data = Shuttle)

summary(LinReg3)

## 
## Call:
## lm(formula = TotalFails ~ TempC, data = Shuttle)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.057 -1.154 -0.251  1.102  2.323 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.7988     1.5998    4.25  0.00036 ***
## TempC        -0.2509     0.0754   -3.33  0.00321 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.39 on 21 degrees of freedom
## Multiple R-squared:  0.345,  Adjusted R-squared:  0.314 
## F-statistic: 11.1 on 1 and 21 DF,  p-value: 0.00321

anova(LinReg3)

## Analysis of Variance Table
## 
## Response: TotalFails
##           Df Sum Sq Mean Sq F value Pr(>F)   
## TempC      1   21.3   21.27    11.1 0.0032 **
## Residuals 21   40.4    1.92                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Calculate Predictions
Shuttle$PredictLm <- predict(LinReg3)

# Calculate RMSE
RMSELinReg3 <- sqrt(mean((Shuttle$TotalFails - Shuttle$PredictLm)^2))

cat("\n\nRMSE of Linear Regression: ", sprintf(RMSELinReg3, fmt = '%#.4f'), "\n")

## 
## 
## RMSE of Linear Regression:  1.3250

4.5 Plotting Data with Regression Line

# Plot Data with Regression Line
plot(Shuttle$TempC, Shuttle$TotalFails, 
     main = "Space Shuttle: Scatter Plot of Outside Temperature vs Total Fails", 
     xlab = "Outside Temperature", 
     ylab = "Number of Fails", 
     pch = 16, 
     col = "blue") 

abline(LinReg3, col = "red")
grid(col = "lightgrey", lty = "dotted")

4.6 Plotting Confidence and Prediction Interval

# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)

# 95% quantile, two tailed
rq_low  <- rq(Shuttle$TotalFails ~ Shuttle$TempC, data = Shuttle, tau = 0.025)  #lower quantile
rq_high <- rq(Shuttle$TotalFails ~ Shuttle$TempC, data = Shuttle, tau = 0.975)  #upper quantile

Shuttle %>% 
  mutate(low  = rq_low$coefficients[1]  + Shuttle$TempC * rq_low$coefficients[2],
         high = rq_high$coefficients[1] + Shuttle$TempC * rq_high$coefficients[2]) %>% 
  ggplot() +
  geom_point(aes(Shuttle$TempC, Shuttle$TotalFails), size = 4) +
  geom_smooth(aes(Shuttle$TempC, Shuttle$TotalFails), method = "lm") +
  geom_line(aes(Shuttle$TempC, low),  linetype = "dashed", color = "red") +
  geom_line(aes(Shuttle$TempC, high), linetype = "dashed", color = "red") +
  theme_bw() +
  labs(title = "Space Shuttle Launch: Probability of Fails over Outside Temperature",
              subtitle = "Based on Real Data",
              caption = "Data source: NASA",
              x = "Outside Temperature in ºC", 
              y = "Number of Fails") +
    theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    plot.x = element_text(size = 16),
  )

4.7 Performing Logistic Regression

# Load Library if not available
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

# Make 0/1 Column
Shuttle$Fail <- ifelse(Shuttle$TotalFails > 0, 1, 0)

# Run Regression
LogReg3 <- glm(fail.field ~ TempC, data = Shuttle, family = "binomial")

summary(LogReg3)

## 
## Call:
## glm(formula = fail.field ~ TempC, family = "binomial", data = Shuttle)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    7.584      3.915    1.94    0.053 .
## TempC         -0.417      0.194   -2.15    0.032 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28.267  on 22  degrees of freedom
## Residual deviance: 20.335  on 21  degrees of freedom
## AIC: 24.33
## 
## Number of Fisher Scoring iterations: 5

anova(LogReg3)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: fail.field
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                     22       28.3            
## TempC  1     7.93        21       20.3   0.0049 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.8 Showing Plot with CI for Shuttle

# Load Library if not available
if(! "sjPlot" %in% installed.packages()) { install.packages("sjPlot", dependencies = TRUE) }
library(sjPlot)

# Plot Probability with CI
pm3 <- plot_model(LogReg3, 
           type = "pred"
)

pm3 + labs(title = "Space Shuttle Launch: Probability of Fails over Outside Temperature",
              subtitle = "Based on Real Data",
              caption = "Data source: NASA",
              x = "Outside Temperature", 
              y = "Number of Fails") +
    theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    plot.caption = element_text(hjust = 0, face = "italic", size = 10),
    plot.x = element_text(size = 14),
  )

4.9 Showing McFadden’s Rsquare

# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)

pR2(LogReg3)

## fitting null model for pseudo-r2

##      llh  llhNull       G2 McFadden     r2ML     r2CU 
## -10.1674 -14.1336   7.9323   0.2806   0.2917   0.4123

4.10 Predicting the Desaster

cat("\n\nOutside Temperature at launch of Challenger was 36°F, i.e., 2.2°C.\n")

## 
## 
## Outside Temperature at launch of Challenger was 36°F, i.e., 2.2°C.

cat("Until then the lowest Outside Temperature at launch of any Shuttle was", min(round(Shuttle$TempC, 2)), "°C.\n")

## Until then the lowest Outside Temperature at launch of any Shuttle was 11.7 °C.

cat("What is the Risk of an O-Ring Field Failure?\n")

## What is the Risk of an O-Ring Field Failure?

newTemp <- data.frame(TempC = c(2.2))
newData <- data.frame(TempC = c(2.2, 3, 4, 20, 30))


newRisk <- predict(LogReg3, newdata = newTemp, type = "response")

cat("The Risk of Field Failure at 2.2°C Outside Temperature was", round(newRisk * 100, 2), "%.\n\n")

## The Risk of Field Failure at 2.2°C Outside Temperature was 99.87 %.

predictions <- predict(LogReg3, newdata = newData, type = "link", se.fit = TRUE)

alpha <- 0.05
z_value <- qnorm(1 - alpha / 2)
lower_logit <- predictions$fit - z_value * predictions$se.fit
upper_logit <- predictions$fit + z_value * predictions$se.fit

lower_prob <- exp(lower_logit) / (1 + exp(lower_logit))
upper_prob <- exp(upper_logit) / (1 + exp(upper_logit))

results <- data.frame(
  Temperature    = newData$TempC,
  Lower_Prob     = paste(round(lower_prob * 100, 2) , "%"),
  Predicted_Prob = paste(round(exp(predictions$fit) / (1 + exp(predictions$fit)) * 100, 2) , "%"),
  Upper_Prob     = paste(round(upper_prob * 100, 2) , "%")
)

# Insert space between Columns


# Print table with CIs
cat("\n\nSome more prediction data with 95% Confidence Interval:\n\n")

## 
## 
## Some more prediction data with 95% Confidence Interval:

print.data.frame(results)

##   Temperature Lower_Prob Predicted_Prob Upper_Prob
## 1         2.2    45.56 %        99.87 %      100 %
## 2         3.0    44.74 %        99.82 %      100 %
## 3         4.0    43.71 %        99.73 %      100 %
## 4        20.0    13.78 %         32.1 %    58.31 %
## 5        30.0     0.01 %         0.73 %    27.87 %

4.11 Calculating Odds Ratio and 95% CI and Predict for Shuttle

# Calculate Odds Ratio and 95% CI
cat("\n\nThe Odds Ratio for Field Failure at an Outside Temperature of 0.0°C is:", round(exp(coef(LogReg3)[1]), 4) , ".\n")

## 
## 
## The Odds Ratio for Field Failure at an Outside Temperature of 0.0°C is: 1966 .

cat("For every degree increase, the Odds Ratio increases by:", round(exp(coef(LogReg3)[2]), 4) , ".\n\n")

## For every degree increase, the Odds Ratio increases by: 0.6593 .

suppressMessages(exp(cbind(OR = coef(LogReg3), confint(LogReg3))))

##                    OR 2.5 %        97.5 %
## (Intercept) 1965.9744 3.805 52874563.0058
## TempC          0.6593 0.397        0.8967

Data Mining - Predictive Modelling
Classification Using Logistic Regression

Updated by Uwe H Kaufmann on 19 Jan 2025, 18:26

1 Loading Libraries

2 Regression for Disease Data

2.1 Getting Data for Disease

2.2 Exploring Data for Disease

2.3 Performing Linear Regression for Disease

2.4 Performing Logistic Regression for Disease

2.5 Comparing Models for Disease

2.6 Showing Scatter Plot for Disease

2.7 Showing Plot with CI for Disease

2.8 Showing McFadden’s Rsquare for Logistic Regression of Disease

2.9 Calculating Odds Ratio and 95% CI and Predict for Disease

2.10 Calculating Deviance

3 Logistic Regression with TelcoChurn

3.1 Getting Data for TelcoChurn (VMchurn)

3.2 Exploring Data for TelcoChurn

3.3 Splitting Data

3.4 Performing Regression

3.5 Performing Regression with Multiple Xs

3.6 Performing Regression with Multiple Xs

3.7 Showing McFadden’s Rsquare

4 Logistic Regression with Space Shuttle Data

4.1 Getting Data

4.2 Exploring Data for Shuttle

4.3 Calculating Confidence Interval for Pearson’s Correlation Coefficient

4.4 Performing Linear Regression

4.5 Plotting Data with Regression Line

4.6 Plotting Confidence and Prediction Interval

4.7 Performing Logistic Regression

4.8 Showing Plot with CI for Shuttle

4.9 Showing McFadden’s Rsquare

4.10 Predicting the Desaster

4.11 Calculating Odds Ratio and 95% CI and Predict for Shuttle

Data Mining - Predictive ModellingClassification Using Logistic Regression

Updated by Uwe H Kaufmann on 19 Jan 2025, 18:26

1 Loading Libraries

2 Regression for Disease Data

2.1 Getting Data for Disease

2.2 Exploring Data for Disease

2.3 Performing Linear Regression for Disease

2.4 Performing Logistic Regression for Disease

2.5 Comparing Models for Disease

2.6 Showing Scatter Plot for Disease

2.7 Showing Plot with CI for Disease

2.8 Showing McFadden’s Rsquare for Logistic Regression of Disease

2.9 Calculating Odds Ratio and 95% CI and Predict for Disease

2.10 Calculating Deviance

3 Logistic Regression with TelcoChurn

3.1 Getting Data for TelcoChurn (VMchurn)

3.2 Exploring Data for TelcoChurn

3.3 Splitting Data

3.4 Performing Regression

3.5 Performing Regression with Multiple Xs

3.6 Performing Regression with Multiple Xs

3.7 Showing McFadden’s Rsquare

4 Logistic Regression with Space Shuttle Data

4.1 Getting Data

4.2 Exploring Data for Shuttle

4.3 Calculating Confidence Interval for Pearson’s Correlation Coefficient

4.4 Performing Linear Regression

4.5 Plotting Data with Regression Line

4.6 Plotting Confidence and Prediction Interval

4.7 Performing Logistic Regression

4.8 Showing Plot with CI for Shuttle

4.9 Showing McFadden’s Rsquare

4.10 Predicting the Desaster

4.11 Calculating Odds Ratio and 95% CI and Predict for Shuttle

Data Mining - Predictive Modelling
Classification Using Logistic Regression