1 Load Libraries

# Loading necessary packages. Being installed if not installed before.

## Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)

## Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)

2 Make Settings

# Global Settings
options(digits =   4)
options(scipen = 999)

# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")

# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")

3 Read Dataset

4 Lesson 01

4.1 Setting Work Directory

# If working directory is set, you only need to include filename of the file you wish to open here 
cars  <- read.csv("cars.csv", stringsAsFactors = FALSE)

# Write data to working directory 

4.2 To display the whole dataset, type the dataset name

cat("\n\nShow first parts of car dataset: \n")
## 
## 
## Show first parts of car dataset:
head(cars) # Display the first few records of a dataset
##    mpg cylinders cubicinches  hp weight time.to.60 year    brand l.100km
## 1 14.0         8         350 165   4209         12 1972      US.      17
## 2 31.9         4          89  71   1925         14 1980  Europe.       7
## 3 17.0         8         302 140   3449         11 1971      US.      14
## 4 15.0         8         400 150   3761         10 1971      US.      16
## 5 30.5         4          98  63   2051         17 1978      US.       8
## 6 23.0         8         350 125   3900         17 1980      US.      10
##   weightkg  ccm
## 1     1909 5735
## 2      873 1458
## 3     1564 4949
## 4     1706 6555
## 5      930 1606
## 6     1769 5735
cat("\n\nShow column names of cars: \n")
## 
## 
## Show column names of cars:
names(cars) # Display variable names of a data frame, one kind of data in R
##  [1] "mpg"         "cylinders"   "cubicinches" "hp"          "weight"     
##  [6] "time.to.60"  "year"        "brand"       "l.100km"     "weightkg"   
## [11] "ccm"
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)

cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
## 
## 
## Descriptive Statistics of Columns in Data Frame:
st(cars, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
   title='Summary Statistics',
   summ = list(
     c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
     c('notNA(x)','mean(x)')
   ),
   summ.names = list(
     c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
     c('Count','Percent')
   )
)
##        Variable   N    Mean     SD  Min    P25    P50    P75  Max
## 1           mpg 261   25.16 33.537   10   16.9     22     29  550
## 2     cylinders 261    5.59 1.7333    3      4      6      8    8
## 3   cubicinches 259  200.92 109.26   68   99.5    156    303  455
## 4            hp 261  106.36   40.5   46     75     95    138  230
## 5        weight 258  3001.1 872.94   19 2245.2 2867.5   3670 4997
## 6    time.to.60 261  15.548 2.9106    8     14     16     17   25
## 7          year 261  1976.8 3.6377 1971   1974   1977   1980 1983
## 8         brand 261                                              
## 9  ...  Europe.  48 18.391%                                      
## 10  ...  Japan.  51  19.54%                                      
## 11     ...  US. 162 62.069%                                      
## 12      l.100km 261  11.383 3.8758    5      8     11     14   24
## 13     weightkg 258  1365.2  387.4  732 1019.8 1300.5   1665 2267
## 14          ccm 259  3292.5 1790.4 1114 1630.5   2556 4965.5 7456
##                     NA Mode
## 1                    0     
## 2                    0     
## 3  0.00766283524904215     
## 4                    0     
## 5   0.0114942528735632     
## 6                    0     
## 7                    0     
## 8                          
## 9                          
## 10                         
## 11                         
## 12                   0     
## 13  0.0114942528735632     
## 14 0.00766283524904215
# Show Characteristics of Data Frame
knitr::kable(head(cars), format = "html", caption = "Sample Data from cars") %>%
   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sample Data from cars
mpg cylinders cubicinches hp weight time.to.60 year brand l.100km weightkg ccm
14.0 8 350 165 4209 12 1972 US. 17 1909 5735
31.9 4 89 71 1925 14 1980 Europe. 7 873 1458
17.0 8 302 140 3449 11 1971 US. 14 1564 4949
15.0 8 400 150 3761 10 1971 US. 16 1706 6555
30.5 4 98 63 2051 17 1978 US. 8 930 1606
23.0 8 350 125 3900 17 1980 US. 10 1769 5735
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)

# Generate a nice Summary
print(dfSummary(cars), method = 'render')

Data Frame Summary

cars

Dimensions: 261 x 11
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 mpg [numeric]
Mean (sd) : 25.2 (33.5)
min ≤ med ≤ max:
10 ≤ 22 ≤ 550
IQR (CV) : 12.1 (1.3)
104 distinct values 261 (100.0%) 0 (0.0%)
2 cylinders [integer]
Mean (sd) : 5.6 (1.7)
min ≤ med ≤ max:
3 ≤ 6 ≤ 8
IQR (CV) : 4 (0.3)
3:2(0.8%)
4:125(47.9%)
5:3(1.1%)
6:55(21.1%)
8:76(29.1%)
261 (100.0%) 0 (0.0%)
3 cubicinches [integer]
Mean (sd) : 200.9 (109.3)
min ≤ med ≤ max:
68 ≤ 156 ≤ 455
IQR (CV) : 203.5 (0.5)
74 distinct values 259 (99.2%) 2 (0.8%)
4 hp [integer]
Mean (sd) : 106.4 (40.5)
min ≤ med ≤ max:
46 ≤ 95 ≤ 230
IQR (CV) : 63 (0.4)
85 distinct values 261 (100.0%) 0 (0.0%)
5 weight [integer]
Mean (sd) : 3001 (872.9)
min ≤ med ≤ max:
19 ≤ 2868 ≤ 4997
IQR (CV) : 1425 (0.3)
239 distinct values 258 (98.9%) 3 (1.1%)
6 time.to.60 [integer]
Mean (sd) : 15.5 (2.9)
min ≤ med ≤ max:
8 ≤ 16 ≤ 25
IQR (CV) : 3 (0.2)
17 distinct values 261 (100.0%) 0 (0.0%)
7 year [integer]
Mean (sd) : 1977 (3.6)
min ≤ med ≤ max:
1971 ≤ 1977 ≤ 1983
IQR (CV) : 6 (0)
13 distinct values 261 (100.0%) 0 (0.0%)
8 brand [character]
1.  
·
Europe.
2.  
·
Japan.
3.  
·
US.
48(18.4%)
51(19.5%)
162(62.1%)
261 (100.0%) 0 (0.0%)
9 l.100km [integer]
Mean (sd) : 11.4 (3.9)
min ≤ med ≤ max:
5 ≤ 11 ≤ 24
IQR (CV) : 6 (0.3)
17 distinct values 261 (100.0%) 0 (0.0%)
10 weightkg [integer]
Mean (sd) : 1365 (387.4)
min ≤ med ≤ max:
732 ≤ 1300 ≤ 2267
IQR (CV) : 645.2 (0.3)
225 distinct values 258 (98.9%) 3 (1.1%)
11 ccm [integer]
Mean (sd) : 3292 (1790)
min ≤ med ≤ max:
1114 ≤ 2556 ≤ 7456
IQR (CV) : 3335 (0.5)
74 distinct values 259 (99.2%) 2 (0.8%)

Generated by summarytools 1.0.1 (R version 4.4.1)
2024-11-18

4.3 Create Matrices

# Create a matrix with three rows, two columns, and every value equal to 0.0
mat <- matrix(0.0, nrow = 3, ncol = 2); 
cat("\n\nMatrix with 3 rows and 2 columns: \n")
## 
## 
## Matrix with 3 rows and 2 columns:
mat
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    0
## [3,]    0    0
colnames(mat) <- c("Var1", "Var2")  # Give a matrix variable names
cat("\n\nVariable names of the matrix: \n")
## 
## 
## Variable names of the matrix:
colnames(mat)                       # Display variable names of a matrix
## [1] "Var1" "Var2"

4.4 Subset data and declare new variables

cars.rsub <- cars[1:5,] # Subset the data by rows
cat("\n\nFirst 5 rows of cars: \n")
## 
## 
## First 5 rows of cars:
cars.rsub
##    mpg cylinders cubicinches  hp weight time.to.60 year    brand l.100km
## 1 14.0         8         350 165   4209         12 1972      US.      17
## 2 31.9         4          89  71   1925         14 1980  Europe.       7
## 3 17.0         8         302 140   3449         11 1971      US.      14
## 4 15.0         8         400 150   3761         10 1971      US.      16
## 5 30.5         4          98  63   2051         17 1978      US.       8
##   weightkg  ccm
## 1     1909 5735
## 2      873 1458
## 3     1564 4949
## 4     1706 6555
## 5      930 1606
cars.rcsub <- cars[c(1,3,5), c(2,4)] # Subset by specific rows and columns 
cat("\n\nRows 1, 3 and 5 of columns 2 and 4 of cars: \n")
## 
## 
## Rows 1, 3 and 5 of columns 2 and 4 of cars:
cars.rcsub
##   cylinders  hp
## 1         8 165
## 3         8 140
## 5         4  63
cars.vsub <- cars[which(cars$mpg > 40),] # Subset by a logical condition 
cat("\n\nAll datasets with mpg > 40 of cars: \n")
## 
## 
## All datasets with mpg > 40 of cars:
cars.vsub
##       mpg cylinders cubicinches hp weight time.to.60 year    brand l.100km
## 20  550.0         4         107 90   2430         15 1971  Europe.      10
## 64   46.6         4          86 65   2110         18 1981   Japan.       5
## 107  43.4         4          90 48   2335         24 1981  Europe.       5
## 196  41.5         4          98 76   2144         15 1981  Europe.       6
## 198  43.1         4          90 48   1985         22 1979  Europe.       5
## 207  40.8         4          85 65   2110         19 1981   Japan.       6
## 236  44.0         4          97 52   2130         25 1983  Europe.       5
## 248  44.3         4          90 48   2085         22 1981  Europe.       5
##     weightkg  ccm
## 20      1102 1753
## 64       957 1409
## 107     1059 1475
## 196      973 1606
## 198      900 1475
## 207      957 1393
## 236      966 1590
## 248      946 1475
# To declare new variables, type the variable name, a left-arrow, then the value of the variable 
weight <- cars[which(cars$mpg > 50),]$weight
mpg <- cars[which(cars$mpg > 50),]$mpg
cat("\nMiles/Gallon:", mpg)
## 
## Miles/Gallon: 550
cat("\nWeight:", weight)
## 
## Weight: 2430
# Display more than one figure at a time
par(mfrow = c(1,1)) # plots one figure; the default setting 
par(mfrow = c(2,3)) # plots six figures: three in the top row, three in the bottom row 

# Plots will fill the plot space row by row

5 Lesson 2

5.1 Deal with Missing Data

# Install a package if not installed then load it
if(! "zoo" %in% installed.packages()) { install.packages("zoo", dependencies = TRUE) }
library(zoo)

# Look at four variables from cars
cars.4var <- cars[,c(1, 3, 4, 8)]
cat("\n\nHead of cars with 4 columns: \n")
## 
## 
## Head of cars with 4 columns:
head(cars.4var)
##    mpg cubicinches  hp    brand
## 1 14.0         350 165      US.
## 2 31.9          89  71  Europe.
## 3 17.0         302 140      US.
## 4 15.0         400 150      US.
## 5 30.5          98  63      US.
## 6 23.0         350 125      US.
# Make certain entries missing
cars.4var[2, 2] <- cars.4var[4, 4] <- NA
cat("\n\nHead of cars with 4 columns and NA at 2,2 and 4,4: \n")
## 
## 
## Head of cars with 4 columns and NA at 2,2 and 4,4:
head(cars.4var)
##    mpg cubicinches  hp    brand
## 1 14.0         350 165      US.
## 2 31.9          NA  71  Europe.
## 3 17.0         302 140      US.
## 4 15.0         400 150     <NA>
## 5 30.5          98  63      US.
## 6 23.0         350 125      US.
# Replace missing values with constants
cars.4var[2,2] <- 0
cars.4var[4,4] <- "Missing"
cat("\n\nHead of cars with 4 columns and missing values replaced with '0' and 'Missing': \n")
## 
## 
## Head of cars with 4 columns and missing values replaced with '0' and 'Missing':
head(cars.4var) 
##    mpg cubicinches  hp    brand
## 1 14.0         350 165      US.
## 2 31.9           0  71  Europe.
## 3 17.0         302 140      US.
## 4 15.0         400 150  Missing
## 5 30.5          98  63      US.
## 6 23.0         350 125      US.
# Replace NA with mean 
cars.4var[2,2] <- mean(na.omit(cars.4var$cubicinches))
cat("\n\nHead of cars with 2,2 replaced by mean: \n")
## 
## 
## Head of cars with 2,2 replaced by mean:
head(cars.4var)
##    mpg cubicinches  hp    brand
## 1 14.0       350.0 165      US.
## 2 31.9       200.6  71  Europe.
## 3 17.0       302.0 140      US.
## 4 15.0       400.0 150  Missing
## 5 30.5        98.0  63      US.
## 6 23.0       350.0 125      US.
# Replace NA with mode 
our_table <- table(cars.4var$brand)
our_mode <- names(our_table) [our_table == max(our_table)]
cars.4var[4,4] <- our_mode
cat("\n\nHead of cars with 4,4 replaced by mode: \n")
## 
## 
## Head of cars with 4,4 replaced by mode:
head(cars.4var)
##    mpg cubicinches  hp    brand
## 1 14.0       350.0 165      US.
## 2 31.9       200.6  71  Europe.
## 3 17.0       302.0 140      US.
## 4 15.0       400.0 150      US.
## 5 30.5        98.0  63      US.
## 6 23.0       350.0 125      US.
# Replace every NA in weight with overall mean
cars$weight   <- na.aggregate(cars$weight) 

# Replace NA and empty with mean in weightkg
meanVal <- mean(cars$weightkg, na.rm = TRUE)
cars$weightkg[is.na(cars$weightkg)] <- meanVal

# Write data to working directory
setwd("~/AC UNI-ORG/AB SIM/GDBA/R")
write.csv(cars, file = "carsFixed.csv")

5.2 Deal with Outliers

# Install a package if not installed then load it
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)

# Display boxplot
cat("\n\nCHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.")
## 
## 
## CHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.
cat("\n\nShow boxplot to identify potential outliers of dataset CHL: \n")
## 
## 
## Show boxplot to identify potential outliers of dataset CHL:
boxplot(Time.in.min ~ Team, data = CHL, col=(c("lightgreen", "orange", "pink", "yellow")), main="Delivery Time by Team", xlab = "Team", ylab = "Delivery time", ylim = c(0, 50), varwidth = TRUE)

# Add grid
grid(nx = NA, ny = NULL, 5, lwd = 1, col = "lightgrey") # grid only in y-direction

# Show Descriptive Stats
cat("\n\nShow descriptive stats to identify potential outliers and missing values in dataset CHL: \n")
## 
## 
## Show descriptive stats to identify potential outliers and missing values in dataset CHL:
summary(CHL$Time.in.min)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.30    5.50    7.50    8.06    9.50   42.00       3

6 Create Histogram and Scatter Plot

# Set up the plot area
par(mfrow = c(1,1))

cat("\n\nDisplay histogram of cars dataset to identify potential outliers: \n")
## 
## 
## Display histogram of cars dataset to identify potential outliers:
# Create the histogram bars
hist(cars$weight, 
  breaks = 30, 
  xlim = c(0,5000), 
  xlab = "Weight",
  ylim = c(0,40),
  ylab = "Counts",
  col =  "lightblue", 
  border = "black",
  main = "Histogram of car weights")

# Make a box around the plot
box(which = "plot", lty = "solid", col = "black")

# Set up the plot area
par(mfrow = c(1,1))

cat("\n\nDisplay scatter plot of weight against MPG of cars dataset to identify potential outliers: \n")
## 
## 
## Display scatter plot of weight against MPG of cars dataset to identify potential outliers:
# Create the scatter plot
plot(cars$weight, cars$mpg,
xlim = c(0,5000), 
xlab = "Weight",
ylim = c(0,600),
ylab = "MPG",
type = "p",
pch = 16,
col = "lightblue", 
border = "black",
main = "Scatter Plot of MPG by Weight")

# Add open black circles
points(cars$weight,cars$mpg, type = "p", col = "black")

6.1 Perform Descriptive Statistics

cat("\n\nDisplay descriptive statistics of cars dataset:\n")
## 
## 
## Display descriptive statistics of cars dataset:
cat("\nMean: ", mean(cars$weight))
## 
## Mean:  3001
cat("\nMedian: ", median(cars$weight))
## 
## Median:  2904
cat("\nSample size: ", length(cars$weight))
## 
## Sample size:  261
cat("\nStandard deviation: ", sd(cars$weight))
## 
## Standard deviation:  867.9
# Install a package if not installed then load it
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)

cat("\n\nNormal distribution indicator ")
## 
## 
## Normal distribution indicator
p.value <- shapiro.test(cars$weight)$p.value
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
## 
## p-value:  0.00000116
cat("\nIf p-value < 0.05, data are not normally distributed.")
## 
## If p-value < 0.05, data are not normally distributed.

6.2 Perform Transformations

# Min-max normalisation
summary(cars$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      19    2246    2904    3001    3664    4997
mi <- min(cars$weight)
ma <- max(cars$weight)
minmax.weight <- (cars$weight - mi) / (ma - mi)
minmax.weight
##   [1] 0.8417 0.3829 0.6890 0.7517 0.4082 0.7796 0.8726 0.8624 0.7053 0.4080
##  [11] 0.4472 0.4357 0.8280 0.7322 0.5991 0.8720 0.4020 0.6858 0.0000 0.4843
##  [21] 0.4018 0.5185 0.6012 0.8106 0.3915 0.3568 0.3524 0.9317 0.7141 0.4743
##  [31] 0.4241 0.6547 0.4996 0.5991 0.3949 0.6822 0.3648 0.5426 0.7907 0.6521
##  [41] 0.7790 0.9279 0.7284 0.7033 0.6161 0.5255 0.7505 0.4773 0.3859 0.6109
##  [51] 0.8929 0.6370 0.7212 0.4297 0.8761 0.8556 0.5848 0.4130 0.3853 0.7638
##  [61] 0.5878 0.4741 0.5808 0.4200 0.8473 0.7043 0.3879 0.3909 0.3628 0.7756
##  [71] 0.6193 0.5265 0.4558 0.5074 0.4419 0.6018 0.6159 0.5014 0.4512 0.4683
##  [81] 0.8650 0.9910 0.6842 0.5374 0.4421 0.3202 0.5534 0.6922 0.4231 0.5426
##  [91] 0.3929 0.4582 0.8198 0.6649 0.4492 0.7810 0.9000 0.6268 0.7656 0.7557
## [101] 1.0000 0.9817 0.4233 0.4381 0.4512 0.5255 0.4652 0.4110 0.5327 0.7001
## [111] 0.4540 0.5854 0.8670 0.3568 0.4241 0.7565 0.5466 0.6067 0.4070 0.8682
## [121] 0.4385 0.3905 0.9339 0.5476 0.3646 0.5115 0.9311 0.4442 0.5149 0.3578
## [131] 0.4582 0.5342 0.9908 0.7338 0.8881 0.8379 0.3889 0.6328 0.5878 0.5946
## [141] 0.5386 0.4323 0.5315 0.3497 0.7204 0.5902 0.6712 0.5456 0.7380 0.6400
## [151] 0.5068 0.5104 0.8851 0.6306 0.4912 0.6595 0.4229 0.4251 0.5253 0.6812
## [161] 0.5994 0.8158 0.5956 0.5796 0.4490 0.6722 0.6870 0.5155 0.5601 0.4231
## [171] 0.4562 0.5647 0.5991 0.5135 0.4434 0.4359 0.8771 0.7338 0.7133 0.4070
## [181] 0.5968 0.4271 0.8429 0.6864 0.4773 0.6129 0.6910 0.7877 0.8256 0.9474
## [191] 0.5225 0.5325 0.8845 0.8162 0.8152 0.4269 0.4964 0.3949 0.3853 0.3869
## [201] 0.4474 0.5263 0.6832 0.7254 0.4785 0.7696 0.4200 0.6031 0.8194 0.5787
## [211] 0.5617 0.6882 0.3487 0.4128 0.6456 0.5215 0.3929 0.8429 0.5175 0.8590
## [221] 0.6376 0.5577 0.8915 0.7636 0.8082 0.3986 0.9233 0.3274 0.5159 0.5566
## [231] 0.7119 0.5235 0.5169 0.8998 0.6290 0.4241 0.8190 0.5918 0.3728 0.4421
## [241] 0.8108 0.6231 0.8439 0.5329 0.8708 0.6790 0.7465 0.4150 0.4291 0.5034
## [251] 0.4221 0.5305 0.5888 0.7973 0.5436 0.4727 0.7676 0.3578 0.5657 0.6567
## [261] 0.6697
# Z-score standardisation
m <- mean(cars$weight)
s <- sd(cars$weight)
z.weight <- (cars$weight - m) / s
z.weight
##   [1]  1.391790 -1.239876  0.516105  0.875597 -1.094697  1.035755  1.569232
##   [8]  1.510469  0.609434 -1.095849 -0.871166 -0.936843  1.313440  0.763831
##  [15]  0.000000  1.565775 -1.130416  0.497669 -3.436004 -0.658006 -1.131568
##  [22] -0.462129  0.012585  1.213197 -1.190331 -1.389665 -1.415013  1.907984
##  [29]  0.660132 -0.715617 -1.003672  0.319075 -0.570437  0.000000 -1.170743
##  [36]  0.476929 -1.343576 -0.323863  1.099127  0.304096  1.032298  1.886092
##  [43]  0.741939  0.597912  0.097849 -0.421801  0.868683 -0.698334 -1.222593
##  [50]  0.067892  1.685606  0.217680  0.700459 -0.971409  1.588820  1.471294
##  [57] -0.081897 -1.067044 -1.226050  0.944730 -0.064614 -0.716769 -0.104941
##  [64] -1.026716  1.424053  0.603673 -1.211071 -1.193788 -1.355098  1.012711
##  [71]  0.116285 -0.416040 -0.821621 -0.525501 -0.901124  0.016042  0.096697
##  [78] -0.560067 -0.848122 -0.750183  1.525448  2.247889  0.488451 -0.353820
##  [85] -0.899972 -1.599368 -0.261643  0.534540 -1.009433 -0.323863 -1.182265
##  [92] -0.807794  1.266199  0.377838 -0.859644  1.043821  1.725934  0.158917
##  [99]  0.955100  0.898641  2.299739  2.194887 -1.008280 -0.923016 -0.848122
## [106] -0.421801 -0.767467 -1.078566 -0.380321  0.579477 -0.831991 -0.078440
## [113]  1.536970 -1.389665 -1.003672  0.903250 -0.300818  0.043695 -1.101610
## [120]  1.543883 -0.920712 -1.196092  1.920659 -0.295057 -1.344728 -0.502457
## [127]  1.904528 -0.888450 -0.482869 -1.383904 -0.807794 -0.372256  2.246736
## [134]  0.773049  1.657953  1.369898 -1.205310  0.193483 -0.064614 -0.025438
## [141] -0.346907 -0.956431 -0.387235 -1.429992  0.695851 -0.050787  0.413557
## [148] -0.306579  0.797246  0.234963 -0.528958 -0.508218  1.640670  0.180809
## [155] -0.618831  0.346728 -1.010585 -0.997910 -0.422953  0.471168  0.002215
## [162]  1.243154 -0.019677 -0.111855 -0.860796  0.419318  0.504582 -0.479412
## [169] -0.223620 -1.009433 -0.819317 -0.197119  0.000000 -0.490934 -0.893059
## [176] -0.935691  1.594581  0.773049  0.655523 -1.101610 -0.012764 -0.986388
## [183]  1.398704  0.501126 -0.698334  0.079414  0.527627  1.081844  1.299613
## [190]  1.997857 -0.439085 -0.381474  1.637213  1.245459  1.239698 -0.987540
## [197] -0.588873 -1.170743 -1.226050 -1.216832 -0.870014 -0.417192  0.482690
## [204]  0.724656 -0.691420  0.978144 -1.026716  0.022955  1.263894 -0.116463
## [211] -0.214402  0.511496 -1.435753 -1.068196  0.267225 -0.444846 -1.182265
## [218]  1.398704 -0.467890  1.490881  0.221137 -0.237446  1.677541  0.943578
## [225]  1.199370 -1.150003  1.859591 -1.557889 -0.477108 -0.243207  0.647457
## [232] -0.433323 -0.471347  1.724782  0.171591 -1.003672  1.261590 -0.041569
## [239] -1.297487 -0.899972  1.214349  0.138177  1.404465 -0.379169  1.558862
## [246]  0.458494  0.845639 -1.055521 -0.974866 -0.548545 -1.015194 -0.392996
## [253] -0.058852  1.137150 -0.318102 -0.724835  0.966622 -1.383904 -0.191358
## [260]  0.330597  0.405492
length(cars$weight)
## [1] 261
# Decimal scaling
max(abs(cars$weight))   # 4 digits
## [1] 4997
d.weight <- cars$weight / 10^4
d.weight
##   [1] 0.4209 0.1925 0.3449 0.3761 0.2051 0.3900 0.4363 0.4312 0.3530 0.2050
##  [11] 0.2245 0.2188 0.4141 0.3664 0.3001 0.4360 0.2020 0.3433 0.0019 0.2430
##  [21] 0.2019 0.2600 0.3012 0.4054 0.1968 0.1795 0.1773 0.4657 0.3574 0.2380
##  [31] 0.2130 0.3278 0.2506 0.3001 0.1985 0.3415 0.1835 0.2720 0.3955 0.3265
##  [41] 0.3897 0.4638 0.3645 0.3520 0.3086 0.2635 0.3755 0.2395 0.1940 0.3060
##  [51] 0.4464 0.3190 0.3609 0.2158 0.4380 0.4278 0.2930 0.2075 0.1937 0.3821
##  [61] 0.2945 0.2379 0.2910 0.2110 0.4237 0.3525 0.1950 0.1965 0.1825 0.3880
##  [71] 0.3102 0.2640 0.2288 0.2545 0.2219 0.3015 0.3085 0.2515 0.2265 0.2350
##  [81] 0.4325 0.4952 0.3425 0.2694 0.2220 0.1613 0.2774 0.3465 0.2125 0.2720
##  [91] 0.1975 0.2300 0.4100 0.3329 0.2255 0.3907 0.4499 0.3139 0.3830 0.3781
## [101] 0.4997 0.4906 0.2126 0.2200 0.2265 0.2635 0.2335 0.2065 0.2671 0.3504
## [111] 0.2279 0.2933 0.4335 0.1795 0.2130 0.3785 0.2740 0.3039 0.2045 0.4341
## [121] 0.2202 0.1963 0.4668 0.2745 0.1834 0.2565 0.4654 0.2230 0.2582 0.1800
## [131] 0.2300 0.2678 0.4951 0.3672 0.4440 0.4190 0.1955 0.3169 0.2945 0.2979
## [141] 0.2700 0.2171 0.2665 0.1760 0.3605 0.2957 0.3360 0.2735 0.3693 0.3205
## [151] 0.2542 0.2560 0.4425 0.3158 0.2464 0.3302 0.2124 0.2135 0.2634 0.3410
## [161] 0.3003 0.4080 0.2984 0.2904 0.2254 0.3365 0.3439 0.2585 0.2807 0.2125
## [171] 0.2290 0.2830 0.3001 0.2575 0.2226 0.2189 0.4385 0.3672 0.3570 0.2045
## [181] 0.2990 0.2145 0.4215 0.3436 0.2395 0.3070 0.3459 0.3940 0.4129 0.4735
## [191] 0.2620 0.2670 0.4422 0.4082 0.4077 0.2144 0.2490 0.1985 0.1937 0.1945
## [201] 0.2246 0.2639 0.3420 0.3630 0.2401 0.3850 0.2110 0.3021 0.4098 0.2900
## [211] 0.2815 0.3445 0.1755 0.2074 0.3233 0.2615 0.1975 0.4215 0.2595 0.4295
## [221] 0.3193 0.2795 0.4457 0.3820 0.4042 0.2003 0.4615 0.1649 0.2587 0.2790
## [231] 0.3563 0.2625 0.2592 0.4498 0.3150 0.2130 0.4096 0.2965 0.1875 0.2220
## [241] 0.4055 0.3121 0.4220 0.2672 0.4354 0.3399 0.3735 0.2085 0.2155 0.2525
## [251] 0.2120 0.2660 0.2950 0.3988 0.2725 0.2372 0.3840 0.1800 0.2835 0.3288
## [261] 0.3353

6.3 Plot side-by-side histograms

par(mfrow = c(1,2))

cat("\n\nPlot Histogram of weight and Histogram of z.weight ")
## 
## 
## Plot Histogram of weight and Histogram of z.weight
# Create two histograms
hist(cars$weight, breaks = 20, xlim = c(1000,5000), main = "Histogram of Weight", col = "lightblue", xlab = "Weight", ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")

hist(z.weight, breaks = 20, xlim = c(-2,3), main = "Histogram of Z-score of Weight", col = "pink", xlab = "Z-score of Weight", ylab = "Counts")

box(which = "plot", lty = "solid", col = "black")

6.4 Skewness + Kurtosis

# Install a package if not installed then load it
if(! "moments" %in% installed.packages()) { install.packages("moments", dependencies = TRUE) }
library(moments)

cat("\n\nSkewness and Kurtosis are indicators for the shape of a distribution. ")
## 
## 
## Skewness and Kurtosis are indicators for the shape of a distribution.
cat("If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.\n ")
## If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.
## 
cat("\nSkewness: ", format(skewness(cars$weight), digits = 4, scientific = FALSE))
## 
## Skewness:  0.2686
cat("\nKurtosis: ", format(kurtosis(cars$weight), digits = 4, scientific = FALSE))
## 
## Kurtosis:  2.473

7 Transformations to reach Normality

cat("\n\nOther transformations to potentially reach normality. ")
## 
## 
## Other transformations to potentially reach normality.
sqrt.weight <- sqrt(cars$weight)        # Square root
cat("If p-value < 0.05, data are not normally distributed.")
## If p-value < 0.05, data are not normally distributed.
cat("\n\np-value of sqrt: ", format(shapiro.test(sqrt.weight)$p.value, digits = 4, scientific = FALSE))
## 
## 
## p-value of sqrt:  0.000000002395
ln.weight <- log(cars$weight)       # Natural log
cat("\n\np-value of ln.weight: ", format(shapiro.test(ln.weight)$p.value, digits = 4, scientific = FALSE))
## 
## 
## p-value of ln.weight:  0.00000000000000000000001447
invsqrt.weight <- 1 / sqrt.weight       # Inverse square root
cat("\n\np-value of invsqrt.weight: ", format(shapiro.test(invsqrt.weight)$p.value, digits = 4, scientific = FALSE))
## 
## 
## p-value of invsqrt.weight:  0.00000000000000000000000000000000985

7.1 Histogram with Normal Distribution

par(mfrow = c(1,1))

x <- rnorm(1000000, mean = mean(invsqrt.weight), s = sd(invsqrt.weight))

cat("\n\nPlot Histogram of invsqrt.weight. ")
## 
## 
## Plot Histogram of invsqrt.weight.
hist(invsqrt.weight, 
  breaks = 30, 
  col = "orange",
  prob = TRUE,
  border = "black",
  xlab = "Inverse Square Root of Weight",
  ylab = "Counts",
  main = "Histogram of Inverse Square Root of Weight")

box(which = "plot", 
    lty = "solid", 
    col = "black")

# Overlay with normal density
lines(density(x), col = "red")

7.2 Normal Q-Q Plot

cat("\n\nPlot Q-Q Plot of invsqrt.weight. ")
## 
## 
## Plot Q-Q Plot of invsqrt.weight.
qqnorm(invsqrt.weight, 
       dataX = TRUE,
       col = "red",
       ylim = c(0.02,0.04),
       main = "Normal Q-Q Plot of Inverse Square Root of Weight")
qqline(invsqrt.weight,
       col = "blue",
       datax = FALSE)

8 Create indicator Variables

north_flag <- east_flag <- south_flag <- c(rep(NA,10))
region <- c(rep(c("north", "south", "east", "west"),2), "north", "south")

# Change region variables to indicators
for (i in 1:length(region)) { 
  if(region[i] == "north") north_flag[i] = 1
  else north_flag[i] = 0
  if(region[i] == "south") south_flag[i] = 1
  else south_flag[i] = 0
  if(region[i] == "east") east_flag[i] = 1
  else east_flag[i] = 0
}
region; north_flag; south_flag; east_flag
##  [1] "north" "south" "east"  "west"  "north" "south" "east"  "west"  "north"
## [10] "south"
##  [1] 1 0 0 0 1 0 0 0 1 0
##  [1] 0 1 0 0 0 1 0 0 0 1
##  [1] 0 0 1 0 0 0 1 0 0 0

9 Find Duplicated Records

# For number of duplicate records, use anyDuplicated
anyDuplicated(cars)
## [1] 0
# To examine each record, use duplicated (TRUE/FALSE per record)
duplicated(cars)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Let's duplicate the first record
new.cars <- rbind(cars, cars[1,])

# Check for duplicates
anyDuplicated(new.cars)
## [1] 262
duplicated(new.cars)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE