Regression for Disease
Data
Getting Data for
Disease
# Download Training Data from URL
# Download Test Data from URL
Exploring Data for
Disease
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
Patients$ID <- 1:nrow(Patients)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Patients)
## [1] "PatientID" "Age" "Disease" "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Patients)
## 'data.frame': 20 obs. of 4 variables:
## $ PatientID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 29 30 31 32 41 41 42 44 49 ...
## $ Disease : int 0 0 0 0 0 0 0 0 1 1 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Patients, 5)
## PatientID Age Disease ID
## 1 1 25 0 1
## 2 2 29 0 2
## 3 3 30 0 3
## 4 4 31 0 4
## 5 5 32 0 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Patients, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 PatientID 20 10.5 5.9161 1 5.75 10.5 15.25 20 0
## 2 Age 20 52.95 19.508 25 38.75 49.5 69 84 0
## 3 Disease 20 0.35 0.48936 0 0 0 1 1 0
## 4 ID 20 10.5 5.9161 1 5.75 10.5 15.25 20 0
Comparing Models for
Disease
# Produce AIC table
AIClin <- AIC(LinReg1)
AIClog <- AIC(LogReg1)
cat("\n\nAIC of Linear Regression: ", sprintf(AIClin, fmt = '%#.4f'), "\n")
##
##
## AIC of Linear Regression: 26.9915
cat("AIC of Logistic Regression: ", sprintf(AIClog, fmt = '%#.4f'), "\n")
## AIC of Logistic Regression: 24.2015
# Load Library if not available
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "broom" %in% installed.packages()) { install.packages("broom", dependencies = TRUE) }
library(broom)
glance(LinReg1) %>%
dplyr::select(adj.r.squared, AIC, BIC, p.value)
## # A tibble: 1 × 4
## adj.r.squared AIC BIC p.value
## <dbl> <dbl> <dbl> <dbl>
## 1 0.224 27.0 30.0 0.0202
glance(LogReg1) %>%
dplyr::select(AIC, BIC)
## # A tibble: 1 × 2
## AIC BIC
## <dbl> <dbl>
## 1 24.2 26.2
Showing Scatter Plot
for Disease
# Show Scatter Plot
plot(Patients$Age, Patients$Disease,
xlab = "Age", ylab = "Disease", cex.lab = 1.2,
main = "Disease vs. Age", cex.main = 1.6,
xlim = c(20, 90), pch = 16)
abline(LinReg1, lty = 3, col = "red")
# Add Courve to Plot
curve(predict(LogReg1, data.frame(Age = x),
type = "resp"), add = TRUE, lwd = 2, col = "blue")
legend("topleft", legend = c("Lin", "Log"), col = c("red", "blue"), lty = c(3, 1), cex = 1.2)
Showing Plot with CI
for Disease
# Load Library if not available
if(! "sjPlot" %in% installed.packages()) { install.packages("sjPlot", dependencies = TRUE) }
library(sjPlot)
# Plot Probability with CI
plot_model(LogReg1, type = "pred", axis.lim = c(0, 1))
Showing McFadden’s
Rsquare for Logistic Regression of Disease
# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)
pR2(LogReg1)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -10.1007 -12.9489 5.6964 0.2200 0.2479 0.3414
Calculating Odds
Ratio and 95% CI and Predict for Disease
# Calculate Odds Ratio and 95% CI
exp(cbind(OR = coef(LogReg1), confint(LogReg1)))
## OR 2.5 % 97.5 %
## (Intercept) 0.01262 0.0001111 0.3458
## Age 1.06925 1.0110685 1.1534
# Predict Chance to Have Disease For Test Data
PatientsTest$Prob <- predict(LogReg1, newdata = PatientsTest, type = "response")
cat("\n\nThe Probability of Catching Disease for New Data is Shown in Prob:\n")
##
##
## The Probability of Catching Disease for New Data is Shown in Prob:
PatientsTest
## PatientID Age Disease Prob
## 1 1 26 0 0.06715
## 2 2 31 0 0.09142
## 3 3 31 1 0.09142
## 4 4 32 0 0.09713
## 5 5 35 0 0.11623
## 6 6 41 0 0.16426
## 7 7 41 1 0.16426
## 8 8 42 0 0.17365
## 9 9 82 1 0.75367
## 10 10 83 1 0.76589
Disease50 <- exp(LogReg1$coefficients[1] + LogReg1$coefficients[2] * 50) / (1 + exp(LogReg1$coefficients[1] + LogReg1$coefficients[2] * 50))
cat("\n\nThe Probability of a 50-Year-old Patient Catching the Disease is:", Disease50 * 100, "%.\n")
##
##
## The Probability of a 50-Year-old Patient Catching the Disease is: 26.42 %.
Calculating
Deviance
# Run Regression for Null Model - Only Intercept
LogReg1Null <- glm(Disease ~ 1, data = Patients, family = binomial(link = 'logit'))
summary(LogReg1Null)
##
## Call:
## glm(formula = Disease ~ 1, family = binomial(link = "logit"),
## data = Patients)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.619 0.469 -1.32 0.19
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25.898 on 19 degrees of freedom
## Residual deviance: 25.898 on 19 degrees of freedom
## AIC: 27.9
##
## Number of Fisher Scoring iterations: 4
anova(LogReg1Null)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Disease
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 25.9
# Compute Rsq
r2log <- 1 - LogReg1$deviance / LogReg1$null.deviance
cat("\n\nThe R2 of this Logistic Regression Model is:", r2log * 100, "%.\n")
##
##
## The R2 of this Logistic Regression Model is: 22 %.
Logistic Regression
with TelcoChurn
Getting Data for
TelcoChurn (VMchurn)
# Download Training Data from URL
Exploring Data for
TelcoChurn
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
VMchurn$ID <- 1:nrow(VMchurn)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(VMchurn)
## [1] "CustomerID" "Gender" "SeniorCitizen"
## [4] "MaritalStatus" "Dependents" "Tenure"
## [7] "PhoneService" "MultipleLines" "InternetService"
## [10] "OnlineSecurity" "OnlineBackup" "DeviceProtection"
## [13] "TechSupport" "StreamingTV" "StreamingMovies"
## [16] "Contract" "PaperlessBilling" "PaymentMethod"
## [19] "InternationalPlan" "VoiceMailPlan" "NumbervMailMessages"
## [22] "TotalDayMinutes" "TotalDayCalls" "TotalEveMinutes"
## [25] "TotalEveCalls" "TotalNightMinutes" "TotalNightCalls"
## [28] "TotalIntlMinutes" "TotalIntlCalls" "CustomerServiceCalls"
## [31] "TotalCall" "TotalRevenue" "Churn"
## [34] "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(VMchurn)
## 'data.frame': 3333 obs. of 34 variables:
## $ CustomerID : Factor w/ 3333 levels "0002-ORFBO","0004-TLHLJ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 2 1 1 ...
## $ SeniorCitizen : int 0 0 0 1 1 0 1 1 0 0 ...
## $ MaritalStatus : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 1 1 2 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 2 ...
## $ Tenure : int 9 4 9 71 7 5 1 45 3 4 ...
## $ PhoneService : Factor w/ 1 level "Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ MultipleLines : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 2 1 2 1 2 2 1 3 3 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 3 1 1 3 2 2 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 1 1 2 2 ...
## $ DeviceProtection : Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 1 1 3 2 2 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 3 1 3 3 1 1 1 1 2 2 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 3 1 3 3 1 1 1 1 2 2 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 1 1 1 3 2 2 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 2 1 1 3 1 1 1 2 1 1 ...
## $ PaperlessBilling : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 1 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 3 2 1 3 3 3 2 4 4 ...
## $ InternationalPlan : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 1 ...
## $ VoiceMailPlan : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ NumbervMailMessages : int 0 0 36 0 0 0 0 0 0 0 ...
## $ TotalDayMinutes : num 168.8 122.2 178.7 190.2 67.7 ...
## $ TotalDayCalls : int 137 112 134 68 68 95 55 133 158 99 ...
## $ TotalEveMinutes : num 241 132 179 262 196 ...
## $ TotalEveCalls : int 107 94 102 64 86 128 124 86 120 93 ...
## $ TotalNightMinutes : num 205 170 127 130 236 ...
## $ TotalNightCalls : int 106 106 82 92 137 105 81 80 46 106 ...
## $ TotalIntlMinutes : num 15.5 10.3 8 8.8 12 12.9 10 11.5 12.4 8 ...
## $ TotalIntlCalls : int 4 9 4 4 2 5 7 3 3 4 ...
## $ CustomerServiceCalls: int 0 5 2 0 1 3 3 0 1 1 ...
## $ TotalCall : int 354 326 324 228 294 336 270 302 328 303 ...
## $ TotalRevenue : num 593 281 572 7904 340 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 1 2 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(VMchurn, 5)
## CustomerID Gender SeniorCitizen MaritalStatus Dependents Tenure PhoneService
## 1 0002-ORFBO Female 0 Yes Yes 9 Yes
## 2 0004-TLHLJ Male 0 No No 4 Yes
## 3 0013-MHZWF Female 0 No Yes 9 Yes
## 4 0013-SMEOE Female 1 Yes No 71 Yes
## 5 0015-UOCOJ Female 1 No No 7 Yes
## MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No DSL No Yes No
## 2 No Fiber optic No No Yes
## 3 No DSL No No No
## 4 No Fiber optic Yes Yes Yes
## 5 No DSL Yes No No
## TechSupport StreamingTV StreamingMovies Contract PaperlessBilling
## 1 Yes Yes No One year Yes
## 2 No No No Month-to-month Yes
## 3 Yes Yes Yes Month-to-month Yes
## 4 Yes Yes Yes Two year Yes
## 5 No No No Month-to-month Yes
## PaymentMethod InternationalPlan VoiceMailPlan NumbervMailMessages
## 1 Mailed check No No 0
## 2 Electronic check Yes No 0
## 3 Credit card (automatic) No Yes 36
## 4 Bank transfer (automatic) No No 0
## 5 Electronic check No No 0
## TotalDayMinutes TotalDayCalls TotalEveMinutes TotalEveCalls TotalNightMinutes
## 1 168.8 137 241.4 107 204.8
## 2 122.2 112 131.7 94 169.5
## 3 178.7 134 178.6 102 126.8
## 4 190.2 68 262.2 64 130.0
## 5 67.7 68 195.7 86 236.5
## TotalNightCalls TotalIntlMinutes TotalIntlCalls CustomerServiceCalls
## 1 106 15.5 4 0
## 2 106 10.3 9 5
## 3 82 8.0 4 2
## 4 92 8.8 4 0
## 5 137 12.0 2 1
## TotalCall TotalRevenue Churn ID
## 1 354 593.3 No 1
## 2 326 280.9 Yes 2
## 3 324 571.5 No 3
## 4 228 7904.3 No 4
## 5 294 340.4 No 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(VMchurn[, -1], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 4,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75
## 1 Gender 3333
## 2 ... Female 1621 48.63%
## 3 ... Male 1712 51.37%
## 4 SeniorCitizen 3333 0.1101 0.3131 0 0 0 0
## 5 MaritalStatus 3333
## 6 ... No 1831 54.94%
## 7 ... Yes 1502 45.06%
## 8 Dependents 3333
## 9 ... No 2237 67.12%
## 10 ... Yes 1096 32.88%
## 11 Tenure 3333 28 23.26 0 7 23 48
## 12 PhoneService 3333
## 13 ... Yes 3333 100%
## 14 MultipleLines 3333
## 15 ... No 3024 90.73%
## 16 ... Yes 309 9.27%
## 17 InternetService 3333
## 18 ... DSL 1036 31.08%
## 19 ... Fiber optic 1118 33.54%
## 20 ... No 1179 35.37%
## 21 OnlineSecurity 3333
## 22 ... No 1356 40.68%
## 23 ... No internet service 1179 35.37%
## 24 ... Yes 798 23.94%
## 25 OnlineBackup 3333
## 26 ... No 1289 38.67%
## 27 ... No internet service 1179 35.37%
## 28 ... Yes 865 25.95%
## 29 DeviceProtection 3333
## 30 ... No 1320 39.6%
## 31 ... No internet service 1179 35.37%
## 32 ... Yes 834 25.02%
## 33 TechSupport 3333
## 34 ... No 1353 40.59%
## 35 ... No internet service 1179 35.37%
## 36 ... Yes 801 24.03%
## 37 StreamingTV 3333
## 38 ... No 1266 37.98%
## 39 ... No internet service 1179 35.37%
## 40 ... Yes 888 26.64%
## 41 StreamingMovies 3333
## 42 ... No 1242 37.26%
## 43 ... No internet service 1179 35.37%
## 44 ... Yes 912 27.36%
## 45 Contract 3333
## 46 ... Month-to-month 1790 53.71%
## 47 ... One year 762 22.86%
## 48 ... Two year 781 23.43%
## 49 PaperlessBilling 3333
## 50 ... No 1638 49.14%
## 51 ... Yes 1695 50.86%
## 52 PaymentMethod 3333
## 53 ... Bank transfer (automatic) 694 20.82%
## 54 ... Credit card (automatic) 704 21.12%
## 55 ... Electronic check 887 26.61%
## 56 ... Mailed check 1048 31.44%
## 57 InternationalPlan 3333
## 58 ... No 3010 90.31%
## 59 ... Yes 323 9.69%
## 60 VoiceMailPlan 3333
## 61 ... No 2411 72.34%
## 62 ... Yes 922 27.66%
## 63 NumbervMailMessages 3333 8.099 13.69 0 0 0 20
## 64 TotalDayMinutes 3333 179.8 54.47 0 143.7 179.4 216.4
## 65 TotalDayCalls 3333 100.4 20.07 0 87 101 114
## 66 TotalEveMinutes 3333 201 50.71 0 166.6 201.4 235.3
## 67 TotalEveCalls 3333 100.1 19.92 0 87 100 114
## 68 TotalNightMinutes 3333 200.9 50.57 23.2 167 201.2 235.3
## 69 TotalNightCalls 3333 100.1 19.57 33 87 100 113
## 70 TotalIntlMinutes 3333 10.24 2.792 0 8.5 10.3 12.1
## 71 TotalIntlCalls 3333 4.479 2.461 0 3 4 6
## 72 CustomerServiceCalls 3333 1.563 1.315 0 1 1 2
## 73 TotalCall 3333 306.7 34.45 194 284 307 330
## 74 TotalRevenue 3328 1673 1920 18.8 252.6 892.5 2434
## 75 Churn 3333
## 76 ... No 2850 85.51%
## 77 ... Yes 483 14.49%
## 78 ID 3333 1667 962.3 1 834 1667 2500
## Max NA Mode
## 1
## 2
## 3
## 4 1 0
## 5
## 6
## 7
## 8
## 9
## 10
## 11 72 0
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
## 51
## 52
## 53
## 54
## 55
## 56
## 57
## 58
## 59
## 60
## 61
## 62
## 63 51 0
## 64 350.8 0
## 65 165 0
## 66 363.7 0
## 67 170 0
## 68 395 0
## 69 175 0
## 70 20 0
## 71 20 0
## 72 9 0
## 73 418 0
## 74 8476 0.0015001500150015
## 75
## 76
## 77
## 78 3333 0
Splitting Data
# Loading other packages if not available
if(! "rsample" %in% installed.packages()) { install.packages("rsample", dependencies = TRUE) }
library(rsample)
# Convert X and Y in Numeric Columns
VMchurn$ChurnNum <- ifelse(VMchurn$Churn == 'No', 0, 1)
VMchurn$VMplanNum <- ifelse(VMchurn$VoiceMailPlan == 'No', 0, 1)
VMchurn$CSChig <- ifelse(VMchurn$CustomerServiceCalls > 3, 1, 0)
VMchurn$CSCmed <- ifelse(VMchurn$CustomerServiceCalls > 1 & VMchurn$CustomerServiceCalls < 4, 1, 0)
# Split data into train and test
set.seed(1)
split <- initial_split(VMchurn, prop = 0.80)
VMtrain <- training(split)
VMtest <- testing(split)
# Show Split Data
cat("\n\nShow Structure of the Training Data Frame:\n")
##
##
## Show Structure of the Training Data Frame:
cat("Training Data Frame includes ", nrow(VMtrain), " rows and ", ncol(VMtrain), " columns.")
## Training Data Frame includes 2666 rows and 38 columns.
cat("\n\nShow Structure of the Testing Data Frame:\n")
##
##
## Show Structure of the Testing Data Frame:
cat("Testing Data Frame includes ", nrow(VMtest), " rows and ", ncol(VMtest), " columns.")
## Testing Data Frame includes 667 rows and 38 columns.
Showing McFadden’s
Rsquare
# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)
pR2(LogReg3)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -776.4689 -1106.0906 659.2435 0.2980 0.2191 0.3885
Logistic Regression
with Space Shuttle Data
Getting Data
# Download Training Data from URL
Exploring Data for
Shuttle
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
# Add ID to Data Frame
Shuttle$ID <- 1:nrow(Shuttle)
# Show Characteristics of Data Frame
cat("\n\nColumns Available in Data Frame:\n")
##
##
## Columns Available in Data Frame:
names(Shuttle)
## [1] "Flights" "Date" "nfails.field" "nfails.nozzle"
## [5] "fail.field" "fail.nozzle" "TempC" "TotalFails"
## [9] "ID"
cat("\n\nShow Structure of the Data Frame:\n")
##
##
## Show Structure of the Data Frame:
str(Shuttle)
## 'data.frame': 23 obs. of 9 variables:
## $ Flights : Factor w/ 23 levels "1","2","3","41-B",..: 1 2 3 8 17 21 22 23 4 5 ...
## $ Date : Factor w/ 23 levels "03 Feb 1984",..: 8 11 14 7 3 13 21 18 1 5 ...
## $ nfails.field : int 0 1 0 0 0 0 0 0 1 1 ...
## $ nfails.nozzle: int 0 0 0 0 2 0 0 0 1 1 ...
## $ fail.field : int 0 1 0 0 0 0 0 0 1 1 ...
## $ fail.nozzle : int 0 0 0 0 1 0 0 0 1 1 ...
## $ TempC : num 18.9 21.1 20.6 20 19.4 22.2 22.8 21.1 13.9 17.2 ...
## $ TotalFails : int 0 1 0 0 3 0 0 0 3 3 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
cat("\n\nFirst 5 Rows of Data Frame:\n")
##
##
## First 5 Rows of Data Frame:
head(Shuttle, 5)
## Flights Date nfails.field nfails.nozzle fail.field fail.nozzle TempC
## 1 1 12 Apr 1981 0 0 0 0 18.9
## 2 2 12 Nov 1981 1 0 1 0 21.1
## 3 3 22 Mar 1982 0 0 0 0 20.6
## 4 5 11 Nov 1982 0 0 0 0 20.0
## 5 6 04 Apr 1983 0 2 0 1 19.4
## TotalFails ID
## 1 0 1
## 2 1 2
## 3 0 3
## 4 0 4
## 5 3 5
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Shuttle[, -c(1, 2)], add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max NA Mode
## 1 nfails.field 23 0.3913 0.65638 0 0 0 1 2 0
## 2 nfails.nozzle 23 0.73913 0.91539 0 0 0 2 2 0
## 3 fail.field 23 0.30435 0.47047 0 0 0 1 1 0
## 4 fail.nozzle 23 0.43478 0.50687 0 0 0 1 1 0
## 5 TempC 23 20.861 3.9195 11.7 19.4 21.1 23.9 27.2 0
## 6 TotalFails 23 1.5652 1.674 0 0 1 3 5 0
## 7 ID 23 12 6.7823 1 6.5 12 17.5 23 0
Calculating
Confidence Interval for Pearson’s Correlation Coefficient
# Loading Package if Not Available
if(! "confintr" %in% installed.packages()) { install.packages("confintr", dependencies = TRUE) }
library(confintr)
# Calculate Correlation Coefficient
ci_cor(
x = Shuttle$TempC,
y = Shuttle$TotalFails,
probs = c(0.025, 0.975),
method = "pearson"
)
##
## Two-sided 95% normal confidence interval for the true Pearson
## correlation coefficient
##
## Sample estimate: -0.5874
## Confidence interval:
## 2.5% 97.5%
## -0.8048 -0.2312
Plotting Data with
Regression Line
# Plot Data with Regression Line
plot(Shuttle$TempC, Shuttle$TotalFails,
main = "Space Shuttle: Scatter Plot of Outside Temperature vs Total Fails",
xlab = "Outside Temperature",
ylab = "Number of Fails",
pch = 16,
col = "blue")
abline(LinReg3, col = "red")
grid(col = "lightgrey", lty = "dotted")
Plotting Confidence
and Prediction Interval
# Loading other packages
if(! "tidyverse" %in% installed.packages()) { install.packages("tidyverse", dependencies = TRUE) }
library(tidyverse)
if(! "quantreg" %in% installed.packages()) { install.packages("quantreg", dependencies = TRUE) }
library(quantreg)
# 95% quantile, two tailed
rq_low <- rq(Shuttle$TotalFails ~ Shuttle$TempC, data = Shuttle, tau = 0.025) #lower quantile
rq_high <- rq(Shuttle$TotalFails ~ Shuttle$TempC, data = Shuttle, tau = 0.975) #upper quantile
Shuttle %>%
mutate(low = rq_low$coefficients[1] + Shuttle$TempC * rq_low$coefficients[2],
high = rq_high$coefficients[1] + Shuttle$TempC * rq_high$coefficients[2]) %>%
ggplot() +
geom_point(aes(Shuttle$TempC, Shuttle$TotalFails), size = 4) +
geom_smooth(aes(Shuttle$TempC, Shuttle$TotalFails), method = "lm") +
geom_line(aes(Shuttle$TempC, low), linetype = "dashed", color = "red") +
geom_line(aes(Shuttle$TempC, high), linetype = "dashed", color = "red") +
theme_bw() +
labs(title = "Space Shuttle Launch: Probability of Fails over Outside Temperature",
subtitle = "Based on Real Data",
caption = "Data source: NASA",
x = "Outside Temperature in ºC",
y = "Number of Fails") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
plot.x = element_text(size = 16),
)
Showing Plot with CI
for Shuttle
# Load Library if not available
if(! "sjPlot" %in% installed.packages()) { install.packages("sjPlot", dependencies = TRUE) }
library(sjPlot)
# Plot Probability with CI
pm3 <- plot_model(LogReg3,
type = "pred"
)
pm3 + labs(title = "Space Shuttle Launch: Probability of Fails over Outside Temperature",
subtitle = "Based on Real Data",
caption = "Data source: NASA",
x = "Outside Temperature",
y = "Number of Fails") +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 14),
plot.caption = element_text(hjust = 0, face = "italic", size = 10),
plot.x = element_text(size = 14),
)
Showing McFadden’s
Rsquare
# Load Library if not available
if(! "pscl" %in% installed.packages()) { install.packages("pscl", dependencies = TRUE) }
library(pscl)
pR2(LogReg3)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -10.1674 -14.1336 7.9323 0.2806 0.2917 0.4123
Predicting the
Desaster
cat("\n\nOutside Temperature at launch of Challenger was 36°F, i.e., 2.2°C.\n")
##
##
## Outside Temperature at launch of Challenger was 36°F, i.e., 2.2°C.
cat("Until then the lowest Outside Temperature at launch of any Shuttle was", min(round(Shuttle$TempC, 2)), "°C.\n")
## Until then the lowest Outside Temperature at launch of any Shuttle was 11.7 °C.
cat("What is the Risk of an O-Ring Field Failure?\n")
## What is the Risk of an O-Ring Field Failure?
newTemp <- data.frame(TempC = c(2.2))
newData <- data.frame(TempC = c(2.2, 3, 4, 20, 30))
newRisk <- predict(LogReg3, newdata = newTemp, type = "response")
cat("The Risk of Field Failure at 2.2°C Outside Temperature was", round(newRisk * 100, 2), "%.\n\n")
## The Risk of Field Failure at 2.2°C Outside Temperature was 99.87 %.
predictions <- predict(LogReg3, newdata = newData, type = "link", se.fit = TRUE)
alpha <- 0.05
z_value <- qnorm(1 - alpha / 2)
lower_logit <- predictions$fit - z_value * predictions$se.fit
upper_logit <- predictions$fit + z_value * predictions$se.fit
lower_prob <- exp(lower_logit) / (1 + exp(lower_logit))
upper_prob <- exp(upper_logit) / (1 + exp(upper_logit))
results <- data.frame(
Temperature = newData$TempC,
Lower_Prob = paste(round(lower_prob * 100, 2) , "%"),
Predicted_Prob = paste(round(exp(predictions$fit) / (1 + exp(predictions$fit)) * 100, 2) , "%"),
Upper_Prob = paste(round(upper_prob * 100, 2) , "%")
)
# Insert space between Columns
# Print table with CIs
cat("\n\nSome more prediction data with 95% Confidence Interval:\n\n")
##
##
## Some more prediction data with 95% Confidence Interval:
print.data.frame(results)
## Temperature Lower_Prob Predicted_Prob Upper_Prob
## 1 2.2 45.56 % 99.87 % 100 %
## 2 3.0 44.74 % 99.82 % 100 %
## 3 4.0 43.71 % 99.73 % 100 %
## 4 20.0 13.78 % 32.1 % 58.31 %
## 5 30.0 0.01 % 0.73 % 27.87 %
Calculating Odds
Ratio and 95% CI and Predict for Shuttle
# Calculate Odds Ratio and 95% CI
cat("\n\nThe Odds Ratio for Field Failure at an Outside Temperature of 0.0°C is:", round(exp(coef(LogReg3)[1]), 4) , ".\n")
##
##
## The Odds Ratio for Field Failure at an Outside Temperature of 0.0°C is: 1966 .
cat("For every degree increase, the Odds Ratio increases by:", round(exp(coef(LogReg3)[2]), 4) , ".\n\n")
## For every degree increase, the Odds Ratio increases by: 0.6593 .
suppressMessages(exp(cbind(OR = coef(LogReg3), confint(LogReg3))))
## OR 2.5 % 97.5 %
## (Intercept) 1965.9744 3.805 52874563.0058
## TempC 0.6593 0.397 0.8967