This chunk imports libraries including functions for data manipulation, and for efficient data reading. Including these libraries ensures a robust set of functions to preprocess and visualise the dataset.
# Loading necessary packages. Being installed if not installed before.
# Markdown Update
if(! "rmarkdown" %in% installed.packages()) { install.packages("rmarkdown", dependencies = TRUE) }
library(rmarkdown)
# Loading other packages
if(! "readxl" %in% installed.packages()) { install.packages("readxl", dependencies = TRUE) }
library(readxl)
if(! "tidyr" %in% installed.packages()) { install.packages("tidyr", dependencies = TRUE) }
library(tidyr)
if(! "ggplot2" %in% installed.packages()) { install.packages("ggplot2", dependencies = TRUE) }
library(ggplot2)
if(! "dplyr" %in% installed.packages()) { install.packages("dplyr", dependencies = TRUE) }
library(dplyr)
if(! "stats" %in% installed.packages()) { install.packages("stats", dependencies = TRUE) }
library(stats)
# More libraries will be loaded as and when required
In this chunk, settings are made to support proper data loading, storing and display.
# Global Settings
options(digits = 4)
options(scipen = 999)
# Make date string
today <- format(as.Date(Sys.time(), tz = "Asia/Singapore"), format = "%y%m%d")
# Setting working Directory
setwd("C:/Users/uwe/OneDrive/Documents/AC UNI-ORG/AB SIM/GDBA/R")
This chunk loads the datasets from CSV files into the variable data using read.csv() for faster performance. Please, ensure that the CSV file is in the working directory, or provide the full path to avoid errors.
# If working directory is set, you only need to include filename of the file you wish to open here
Cars <- read.csv("cars.csv", stringsAsFactors = FALSE)
# Write data to working directory
write.csv(HeightShoe, file = paste(today, "HeightShoe.csv"))
write.csv(CHL, file = paste(today, "CHL4Teams.csv"))
write.csv(Cars, file = paste(today, "Cars.csv"))
This chunk provides summary statistics such as mean, median, min, max, and standard deviation for Cars$weight (in pound) as an example. NA is stripped from the variable before calculation.
cat("\n\nDisplay descriptive statistics of Cars$weight in pound after stripping missing values:\n")
##
##
## Display descriptive statistics of Cars$weight in pound after stripping missing values:
cat("\nMean: ", mean(Cars$weight, na.rm = TRUE))
##
## Mean: 3001
cat("\nMinimum: ", min(Cars$weight, na.rm = TRUE))
##
## Minimum: 19
cat("\nMedian: ", median(Cars$weight, na.rm = TRUE))
##
## Median: 2868
cat("\nMaximum: ", max(Cars$weight, na.rm = TRUE))
##
## Maximum: 4997
cat("\nStandard deviation: ", sd(Cars$weight, na.rm = TRUE))
##
## Standard deviation: 872.9
cat("\nSample size: ", length(Cars$weight))
##
## Sample size: 261
cat("\nMissing Values: ", sum(is.na(Cars$weight)))
##
## Missing Values: 3
This chunk provides more detailed summary statistics for all numerical variables. These statistics help understand the distribution and spread of data, which may inform preprocessing steps such as normalisation.
cat("\n\nShow first parts of car dataset: \n")
##
##
## Show first parts of car dataset:
head(Cars) # Display the first few records of a dataset
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 1 14.0 8 350 165 4209 12 1972 US. 17
## 2 31.9 4 89 71 1925 14 1980 Europe. 7
## 3 17.0 8 302 140 3449 11 1971 US. 14
## 4 15.0 8 400 150 3761 10 1971 US. 16
## 5 30.5 4 98 63 2051 17 1978 US. 8
## 6 23.0 8 350 125 3900 17 1980 US. 10
## weightkg ccm
## 1 1909 5735
## 2 873 1458
## 3 1564 4949
## 4 1706 6555
## 5 930 1606
## 6 1769 5735
cat("\n\nShow column names of Cars: \n")
##
##
## Show column names of Cars:
names(Cars) # Display variable names of a data frame, one kind of data in R
## [1] "mpg" "cylinders" "cubicinches" "hp" "weight"
## [6] "time.to.60" "year" "brand" "l.100km" "weightkg"
## [11] "ccm"
# Loading other packages if not available
if(! "vtable" %in% installed.packages()) { install.packages("vtable", dependencies = TRUE) }
library(vtable)
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
st(Cars, add.median = TRUE, out = "csv", simple.kable = TRUE, col.align = "right", align = "right", digits = 5,
title='Summary Statistics',
summ = list(
c('notNA(x)','mean(x)','sd(x)','min(x)', 'pctile(x)[25]', 'median(x)', 'pctile(x)[75]', 'max(x)', 'propNA(x)', 'getmode(x)'),
c('notNA(x)','mean(x)')
),
summ.names = list(
c('N','Mean','SD','Min','P25','P50','P75', 'Max','NA','Mode'),
c('Count','Percent')
)
)
## Variable N Mean SD Min P25 P50 P75 Max
## 1 mpg 261 25.16 33.537 10 16.9 22 29 550
## 2 cylinders 261 5.59 1.7333 3 4 6 8 8
## 3 cubicinches 259 200.92 109.26 68 99.5 156 303 455
## 4 hp 261 106.36 40.5 46 75 95 138 230
## 5 weight 258 3001.1 872.94 19 2245.2 2867.5 3670 4997
## 6 time.to.60 261 15.548 2.9106 8 14 16 17 25
## 7 year 261 1976.8 3.6377 1971 1974 1977 1980 1983
## 8 brand 261
## 9 ... Europe. 48 18.391%
## 10 ... Japan. 51 19.54%
## 11 ... US. 162 62.069%
## 12 l.100km 261 11.383 3.8758 5 8 11 14 24
## 13 weightkg 258 1365.2 387.4 732 1019.8 1300.5 1665 2267
## 14 ccm 259 3292.5 1790.4 1114 1630.5 2556 4965.5 7456
## NA Mode
## 1 0
## 2 0
## 3 0.00766283524904215
## 4 0
## 5 0.0114942528735632
## 6 0
## 7 0
## 8
## 9
## 10
## 11
## 12 0
## 13 0.0114942528735632
## 14 0.00766283524904215
# Show Characteristics of Data Frame
knitr::kable(head(Cars), format = "html", caption = "Sample Data from Cars") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg | cylinders | cubicinches | hp | weight | time.to.60 | year | brand | l.100km | weightkg | ccm |
---|---|---|---|---|---|---|---|---|---|---|
14.0 | 8 | 350 | 165 | 4209 | 12 | 1972 | US. | 17 | 1909 | 5735 |
31.9 | 4 | 89 | 71 | 1925 | 14 | 1980 | Europe. | 7 | 873 | 1458 |
17.0 | 8 | 302 | 140 | 3449 | 11 | 1971 | US. | 14 | 1564 | 4949 |
15.0 | 8 | 400 | 150 | 3761 | 10 | 1971 | US. | 16 | 1706 | 6555 |
30.5 | 4 | 98 | 63 | 2051 | 17 | 1978 | US. | 8 | 930 | 1606 |
23.0 | 8 | 350 | 125 | 3900 | 17 | 1980 | US. | 10 | 1769 | 5735 |
# Loading other packages if not available
if(! "summarytools" %in% installed.packages()) { install.packages("summarytools", dependencies = TRUE) }
library(summarytools)
# Generate a nice Summary
print(dfSummary(Cars), method = 'render')
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | ||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | mpg [numeric] |
|
104 distinct values | 261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
2 | cylinders [integer] |
|
|
261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
3 | cubicinches [integer] |
|
74 distinct values | 259 (99.2%) | 2 (0.8%) | |||||||||||||||||||||||||||||||||||
4 | hp [integer] |
|
85 distinct values | 261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
5 | weight [integer] |
|
239 distinct values | 258 (98.9%) | 3 (1.1%) | |||||||||||||||||||||||||||||||||||
6 | time.to.60 [integer] |
|
17 distinct values | 261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
7 | year [integer] |
|
13 distinct values | 261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
8 | brand [character] |
|
|
261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
9 | l.100km [integer] |
|
17 distinct values | 261 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||
10 | weightkg [integer] |
|
225 distinct values | 258 (98.9%) | 3 (1.1%) | |||||||||||||||||||||||||||||||||||
11 | ccm [integer] |
|
74 distinct values | 259 (99.2%) | 2 (0.8%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-19
In this chunk, different operations for splitting datasets are shown.
Cars.rsub <- Cars[1:5,] # Subset the data by rows
cat("\n\nFirst 5 rows of Cars: \n")
##
##
## First 5 rows of Cars:
Cars.rsub
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 1 14.0 8 350 165 4209 12 1972 US. 17
## 2 31.9 4 89 71 1925 14 1980 Europe. 7
## 3 17.0 8 302 140 3449 11 1971 US. 14
## 4 15.0 8 400 150 3761 10 1971 US. 16
## 5 30.5 4 98 63 2051 17 1978 US. 8
## weightkg ccm
## 1 1909 5735
## 2 873 1458
## 3 1564 4949
## 4 1706 6555
## 5 930 1606
Cars.rcsub <- Cars[c(1,3,5), c(2,4)] # Subset by specific rows and columns
cat("\n\nRows 1, 3 and 5 of columns 2 and 4 of Cars: \n")
##
##
## Rows 1, 3 and 5 of columns 2 and 4 of Cars:
Cars.rcsub
## cylinders hp
## 1 8 165
## 3 8 140
## 5 4 63
Cars.vsub <- Cars[which(Cars$mpg > 40),] # Subset by a logical condition
cat("\n\nAll datasets with mpg > 40 of Cars: \n")
##
##
## All datasets with mpg > 40 of Cars:
Cars.vsub
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 20 550.0 4 107 90 2430 15 1971 Europe. 10
## 64 46.6 4 86 65 2110 18 1981 Japan. 5
## 107 43.4 4 90 48 2335 24 1981 Europe. 5
## 196 41.5 4 98 76 2144 15 1981 Europe. 6
## 198 43.1 4 90 48 1985 22 1979 Europe. 5
## 207 40.8 4 85 65 2110 19 1981 Japan. 6
## 236 44.0 4 97 52 2130 25 1983 Europe. 5
## 248 44.3 4 90 48 2085 22 1981 Europe. 5
## weightkg ccm
## 20 1102 1753
## 64 957 1409
## 107 1059 1475
## 196 973 1606
## 198 900 1475
## 207 957 1393
## 236 966 1590
## 248 946 1475
# To declare new variables, type the variable name, a left-arrow, then the value of the variable
weight <- Cars[which(Cars$mpg > 50),]$weight
mpg <- Cars[which(Cars$mpg > 50),]$mpg
cat("\nMiles/Gallon:", mpg)
##
## Miles/Gallon: 550
cat("\nWeight:", weight)
##
## Weight: 2430
# Split the dataset in train and test subset in 70/30 ratio
train_size <- floor(0.70 * nrow(Cars))
set.seed(123)
train_ind <- sample(seq_len(nrow(Cars)), size = train_size)
train <- Cars[train_ind, ]
test <- Cars[-train_ind, ]
In this chunk, different methods of dealing with missing values are conducted on Cars.
# Install a package if not installed then load it
if(! "zoo" %in% installed.packages()) { install.packages("zoo", dependencies = TRUE) }
library(zoo)
# Look at four variables from Cars
Cars.4var <- Cars[,c(1, 3, 4, 8)]
cat("\n\nHead of Cars with 4 columns: \n")
##
##
## Head of Cars with 4 columns:
head(Cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 89 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 US.
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Make certain entries missing
Cars.4var[2, 2] <- Cars.4var[4, 4] <- NA
cat("\n\nHead of Cars with 4 columns and NA at 2,2 and 4,4: \n")
##
##
## Head of Cars with 4 columns and NA at 2,2 and 4,4:
head(Cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 NA 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 <NA>
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Replace missing values with constants
Cars.4var[2,2] <- 0
Cars.4var[4,4] <- "Missing"
cat("\n\nHead of Cars with 4 columns and missing values replaced with '0' and 'Missing': \n")
##
##
## Head of Cars with 4 columns and missing values replaced with '0' and 'Missing':
head(Cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 0 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 Missing
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Replace NA with mean
Cars.4var[2,2] <- mean(na.omit(Cars.4var$cubicinches))
cat("\n\nHead of Cars with 2,2 replaced by mean: \n")
##
##
## Head of Cars with 2,2 replaced by mean:
head(Cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350.0 165 US.
## 2 31.9 200.6 71 Europe.
## 3 17.0 302.0 140 US.
## 4 15.0 400.0 150 Missing
## 5 30.5 98.0 63 US.
## 6 23.0 350.0 125 US.
# Replace NA with mode
our_table <- table(Cars.4var$brand)
our_mode <- names(our_table) [our_table == max(our_table)]
Cars.4var[4,4] <- our_mode
cat("\n\nHead of Cars with 4,4 replaced by mode: \n")
##
##
## Head of Cars with 4,4 replaced by mode:
head(Cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350.0 165 US.
## 2 31.9 200.6 71 Europe.
## 3 17.0 302.0 140 US.
## 4 15.0 400.0 150 US.
## 5 30.5 98.0 63 US.
## 6 23.0 350.0 125 US.
# Replace every NA in weight with overall mean
Cars$weight <- na.aggregate(Cars$weight)
# Replace NA and empty with mean in weightkg
meanVal <- mean(Cars$weightkg, na.rm = TRUE)
Cars$weightkg[is.na(Cars$weightkg)] <- meanVal
# Write data to working directory
write.csv(Cars, file = "CarsFixed.csv")
north_flag <- east_flag <- south_flag <- c(rep(NA,10))
region <- c(rep(c("north", "south", "east", "west"),2), "north", "south")
# Change region variables to indicators
for (i in 1:length(region)) {
if(region[i] == "north") north_flag[i] = 1
else north_flag[i] = 0
if(region[i] == "south") south_flag[i] = 1
else south_flag[i] = 0
if(region[i] == "east") east_flag[i] = 1
else east_flag[i] = 0
}
region; north_flag; south_flag; east_flag
## [1] "north" "south" "east" "west" "north" "south" "east" "west" "north"
## [10] "south"
## [1] 1 0 0 0 1 0 0 0 1 0
## [1] 0 1 0 0 0 1 0 0 0 1
## [1] 0 0 1 0 0 0 1 0 0 0
Finding duplicated records can be important to avoid some records to be counted multiple times which can render the resulting model invalid or less accurate for test data.
# For number of duplicate records, use anyDuplicated
duplicated <- anyDuplicated(Cars)
cat("\n\nShow number of duplicated records: ", duplicated, "\n")
##
##
## Show number of duplicated records: 0
# To examine each record, use duplicated (TRUE/FALSE per record)
cat("\n\nShow each record with TRUE if duplicated\n")
##
##
## Show each record with TRUE if duplicated
duplicated(Cars)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Let's duplicate the first record
new.Cars <- rbind(Cars, Cars[1,])
# For number of duplicate records, use anyDuplicated
duplicated <- anyDuplicated(new.Cars)
cat("\n\nShow ID of duplicated records after one record has been duplicated and added: ", duplicated, " \n")
##
##
## Show ID of duplicated records after one record has been duplicated and added: 262
# To examine each record, use duplicated (TRUE/FALSE per record)
cat("\n\nShow each record with TRUE if duplicated\n")
##
##
## Show each record with TRUE if duplicated
duplicated(new.Cars)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
There are multiple ways for detecting outliers. A histogram can be used to search for outliers in one variable, whereas scatter plots indicate outliers in correlated variables.
# Set up the plot area
par(mfrow = c(1,1))
cat("\n\nDisplay histogram of Cars dataset to identify potential outliers: \n")
##
##
## Display histogram of Cars dataset to identify potential outliers:
# Create the histogram bars
hist(Cars$weight,
breaks = 30,
xlim = c(0,5000),
xlab = "Weight",
ylim = c(0,40),
ylab = "Counts",
col = "lightblue",
border = "black",
main = "Histogram of car weights")
# Make a box around the plot
box(which = "plot", lty = "solid", col = "black")
# Set up the plot area
par(mfrow = c(1,1))
cat("\n\nDisplay scatter plot of weight against MPG of Cars dataset to identify potential outliers: \n")
##
##
## Display scatter plot of weight against MPG of Cars dataset to identify potential outliers:
# Create the scatter plot
plot(Cars$weight, Cars$mpg,
xlim = c(0,5000),
xlab = "Weight",
ylim = c(0,600),
ylab = "MPG",
type = "p",
pch = 16,
col = "lightblue",
border = "black",
main = "Scatter Plot of MPG by Weight")
# Add open black circles
points(Cars$weight,Cars$mpg, type = "p", col = "black")
Box plots present another way to detect outliers in one variable. Potential outliers are indicated in the box plot as distinct points outside the box and whisker plot.
# Install a package if not installed then load it
if(! "graphics" %in% installed.packages()) { install.packages("graphics", dependencies = TRUE) }
library(graphics)
if(! "gtsummary" %in% installed.packages()) { install.packages("gtsummary", dependencies = TRUE) }
library(gtsummary)
# Display boxplot
cat("\n\nCHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.")
##
##
## CHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.
# Show Descriptive Stats
cat("\n\nShow descriptive stats to identify potential outliers and missing values in dataset CHL: \n")
##
##
## Show descriptive stats to identify potential outliers and missing values in dataset CHL:
tbl_summary(CHL,
by = Team,
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c("{mean} ({sd})","{median} ({IQR})", "{min}- {max}"), ) %>%
add_stat_label(label = Time.in.min ~ c("Mean (SD)","Median (Inter Quant. Range)", "Min- Max"))
Characteristic | Green N = 29 |
Orange N = 39 |
Red N = 38 |
Yellow N = 32 |
---|---|---|---|---|
X | ||||
Mean (SD) | 15 (9) | 49 (11) | 88 (11) | 123 (9) |
Median (IQR) | 15 (14) | 49 (19) | 88 (19) | 123 (16) |
Min- Max | 1- 29 | 30- 68 | 69- 106 | 107- 138 |
Time.in.min | ||||
Mean (SD) | 10.1 (3.2) | 6.4 (6.3) | 8.1 (3.0) | 8.3 (2.5) |
Median (Inter Quant. Range) | 10.2 (3.4) | 5.0 (3.5) | 7.5 (2.5) | 8.5 (2.5) |
Min- Max | 0.3- 16.5 | 1.3- 42.0 | 3.6- 19.2 | 3.5- 13.4 |
Unknown | 0 | 0 | 1 | 2 |
cat("\n\nShow boxplot to identify potential outliers of dataset CHL: \n")
##
##
## Show boxplot to identify potential outliers of dataset CHL:
boxplot(Time.in.min ~ Team, data = CHL, col=(c("lightgreen", "orange", "pink", "yellow")), main="Delivery Time by Team", xlab = "Team", ylab = "Delivery time", ylim = c(0, 50), varwidth = TRUE, medcol = "red", boxlty = 0, whisklty = 1, staplelwd = 4, outpch = 24, outcex = 3, outbg = "lightgrey")
# Add grid
grid(nx = NA, ny = NULL, 5, lwd = 1, col = "lightgrey") # grid only in y-direction
If the range of variables in one dataframe is very different, it might be appropriate to transform all scales to have a similar range. This range is often -1 to 1, 0 to 1 or any other range as desired.
# Min-max normalisation
summary(Cars$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19 2246 2904 3001 3664 4997
mi <- min(Cars$weight)
ma <- max(Cars$weight)
minmax.weight <- (Cars$weight - mi) / (ma - mi)
cat("\n\nThe dataset of Cars$weight after min-max-transformation is shown:\n")
##
##
## The dataset of Cars$weight after min-max-transformation is shown:
minmax.weight
## [1] 0.8417 0.3829 0.6890 0.7517 0.4082 0.7796 0.8726 0.8624 0.7053 0.4080
## [11] 0.4472 0.4357 0.8280 0.7322 0.5991 0.8720 0.4020 0.6858 0.0000 0.4843
## [21] 0.4018 0.5185 0.6012 0.8106 0.3915 0.3568 0.3524 0.9317 0.7141 0.4743
## [31] 0.4241 0.6547 0.4996 0.5991 0.3949 0.6822 0.3648 0.5426 0.7907 0.6521
## [41] 0.7790 0.9279 0.7284 0.7033 0.6161 0.5255 0.7505 0.4773 0.3859 0.6109
## [51] 0.8929 0.6370 0.7212 0.4297 0.8761 0.8556 0.5848 0.4130 0.3853 0.7638
## [61] 0.5878 0.4741 0.5808 0.4200 0.8473 0.7043 0.3879 0.3909 0.3628 0.7756
## [71] 0.6193 0.5265 0.4558 0.5074 0.4419 0.6018 0.6159 0.5014 0.4512 0.4683
## [81] 0.8650 0.9910 0.6842 0.5374 0.4421 0.3202 0.5534 0.6922 0.4231 0.5426
## [91] 0.3929 0.4582 0.8198 0.6649 0.4492 0.7810 0.9000 0.6268 0.7656 0.7557
## [101] 1.0000 0.9817 0.4233 0.4381 0.4512 0.5255 0.4652 0.4110 0.5327 0.7001
## [111] 0.4540 0.5854 0.8670 0.3568 0.4241 0.7565 0.5466 0.6067 0.4070 0.8682
## [121] 0.4385 0.3905 0.9339 0.5476 0.3646 0.5115 0.9311 0.4442 0.5149 0.3578
## [131] 0.4582 0.5342 0.9908 0.7338 0.8881 0.8379 0.3889 0.6328 0.5878 0.5946
## [141] 0.5386 0.4323 0.5315 0.3497 0.7204 0.5902 0.6712 0.5456 0.7380 0.6400
## [151] 0.5068 0.5104 0.8851 0.6306 0.4912 0.6595 0.4229 0.4251 0.5253 0.6812
## [161] 0.5994 0.8158 0.5956 0.5796 0.4490 0.6722 0.6870 0.5155 0.5601 0.4231
## [171] 0.4562 0.5647 0.5991 0.5135 0.4434 0.4359 0.8771 0.7338 0.7133 0.4070
## [181] 0.5968 0.4271 0.8429 0.6864 0.4773 0.6129 0.6910 0.7877 0.8256 0.9474
## [191] 0.5225 0.5325 0.8845 0.8162 0.8152 0.4269 0.4964 0.3949 0.3853 0.3869
## [201] 0.4474 0.5263 0.6832 0.7254 0.4785 0.7696 0.4200 0.6031 0.8194 0.5787
## [211] 0.5617 0.6882 0.3487 0.4128 0.6456 0.5215 0.3929 0.8429 0.5175 0.8590
## [221] 0.6376 0.5577 0.8915 0.7636 0.8082 0.3986 0.9233 0.3274 0.5159 0.5566
## [231] 0.7119 0.5235 0.5169 0.8998 0.6290 0.4241 0.8190 0.5918 0.3728 0.4421
## [241] 0.8108 0.6231 0.8439 0.5329 0.8708 0.6790 0.7465 0.4150 0.4291 0.5034
## [251] 0.4221 0.5305 0.5888 0.7973 0.5436 0.4727 0.7676 0.3578 0.5657 0.6567
## [261] 0.6697
# Z-score standardisation
m <- mean(Cars$weight)
s <- sd(Cars$weight)
z.weight <- (Cars$weight - m) / s
cat("\n\nThe dataset of Cars$weight after z-score-transformation is shown:\n")
##
##
## The dataset of Cars$weight after z-score-transformation is shown:
z.weight
## [1] 1.391790 -1.239876 0.516105 0.875597 -1.094697 1.035755 1.569232
## [8] 1.510469 0.609434 -1.095849 -0.871166 -0.936843 1.313440 0.763831
## [15] 0.000000 1.565775 -1.130416 0.497669 -3.436004 -0.658006 -1.131568
## [22] -0.462129 0.012585 1.213197 -1.190331 -1.389665 -1.415013 1.907984
## [29] 0.660132 -0.715617 -1.003672 0.319075 -0.570437 0.000000 -1.170743
## [36] 0.476929 -1.343576 -0.323863 1.099127 0.304096 1.032298 1.886092
## [43] 0.741939 0.597912 0.097849 -0.421801 0.868683 -0.698334 -1.222593
## [50] 0.067892 1.685606 0.217680 0.700459 -0.971409 1.588820 1.471294
## [57] -0.081897 -1.067044 -1.226050 0.944730 -0.064614 -0.716769 -0.104941
## [64] -1.026716 1.424053 0.603673 -1.211071 -1.193788 -1.355098 1.012711
## [71] 0.116285 -0.416040 -0.821621 -0.525501 -0.901124 0.016042 0.096697
## [78] -0.560067 -0.848122 -0.750183 1.525448 2.247889 0.488451 -0.353820
## [85] -0.899972 -1.599368 -0.261643 0.534540 -1.009433 -0.323863 -1.182265
## [92] -0.807794 1.266199 0.377838 -0.859644 1.043821 1.725934 0.158917
## [99] 0.955100 0.898641 2.299739 2.194887 -1.008280 -0.923016 -0.848122
## [106] -0.421801 -0.767467 -1.078566 -0.380321 0.579477 -0.831991 -0.078440
## [113] 1.536970 -1.389665 -1.003672 0.903250 -0.300818 0.043695 -1.101610
## [120] 1.543883 -0.920712 -1.196092 1.920659 -0.295057 -1.344728 -0.502457
## [127] 1.904528 -0.888450 -0.482869 -1.383904 -0.807794 -0.372256 2.246736
## [134] 0.773049 1.657953 1.369898 -1.205310 0.193483 -0.064614 -0.025438
## [141] -0.346907 -0.956431 -0.387235 -1.429992 0.695851 -0.050787 0.413557
## [148] -0.306579 0.797246 0.234963 -0.528958 -0.508218 1.640670 0.180809
## [155] -0.618831 0.346728 -1.010585 -0.997910 -0.422953 0.471168 0.002215
## [162] 1.243154 -0.019677 -0.111855 -0.860796 0.419318 0.504582 -0.479412
## [169] -0.223620 -1.009433 -0.819317 -0.197119 0.000000 -0.490934 -0.893059
## [176] -0.935691 1.594581 0.773049 0.655523 -1.101610 -0.012764 -0.986388
## [183] 1.398704 0.501126 -0.698334 0.079414 0.527627 1.081844 1.299613
## [190] 1.997857 -0.439085 -0.381474 1.637213 1.245459 1.239698 -0.987540
## [197] -0.588873 -1.170743 -1.226050 -1.216832 -0.870014 -0.417192 0.482690
## [204] 0.724656 -0.691420 0.978144 -1.026716 0.022955 1.263894 -0.116463
## [211] -0.214402 0.511496 -1.435753 -1.068196 0.267225 -0.444846 -1.182265
## [218] 1.398704 -0.467890 1.490881 0.221137 -0.237446 1.677541 0.943578
## [225] 1.199370 -1.150003 1.859591 -1.557889 -0.477108 -0.243207 0.647457
## [232] -0.433323 -0.471347 1.724782 0.171591 -1.003672 1.261590 -0.041569
## [239] -1.297487 -0.899972 1.214349 0.138177 1.404465 -0.379169 1.558862
## [246] 0.458494 0.845639 -1.055521 -0.974866 -0.548545 -1.015194 -0.392996
## [253] -0.058852 1.137150 -0.318102 -0.724835 0.966622 -1.383904 -0.191358
## [260] 0.330597 0.405492
# Decimal scaling
# max(abs(Cars$weight)) # 4 digits
d.weight <- Cars$weight / 10^4
cat("\n\nThe dataset of Cars$weight after decimal transformation is shown:\n")
##
##
## The dataset of Cars$weight after decimal transformation is shown:
d.weight
## [1] 0.4209 0.1925 0.3449 0.3761 0.2051 0.3900 0.4363 0.4312 0.3530 0.2050
## [11] 0.2245 0.2188 0.4141 0.3664 0.3001 0.4360 0.2020 0.3433 0.0019 0.2430
## [21] 0.2019 0.2600 0.3012 0.4054 0.1968 0.1795 0.1773 0.4657 0.3574 0.2380
## [31] 0.2130 0.3278 0.2506 0.3001 0.1985 0.3415 0.1835 0.2720 0.3955 0.3265
## [41] 0.3897 0.4638 0.3645 0.3520 0.3086 0.2635 0.3755 0.2395 0.1940 0.3060
## [51] 0.4464 0.3190 0.3609 0.2158 0.4380 0.4278 0.2930 0.2075 0.1937 0.3821
## [61] 0.2945 0.2379 0.2910 0.2110 0.4237 0.3525 0.1950 0.1965 0.1825 0.3880
## [71] 0.3102 0.2640 0.2288 0.2545 0.2219 0.3015 0.3085 0.2515 0.2265 0.2350
## [81] 0.4325 0.4952 0.3425 0.2694 0.2220 0.1613 0.2774 0.3465 0.2125 0.2720
## [91] 0.1975 0.2300 0.4100 0.3329 0.2255 0.3907 0.4499 0.3139 0.3830 0.3781
## [101] 0.4997 0.4906 0.2126 0.2200 0.2265 0.2635 0.2335 0.2065 0.2671 0.3504
## [111] 0.2279 0.2933 0.4335 0.1795 0.2130 0.3785 0.2740 0.3039 0.2045 0.4341
## [121] 0.2202 0.1963 0.4668 0.2745 0.1834 0.2565 0.4654 0.2230 0.2582 0.1800
## [131] 0.2300 0.2678 0.4951 0.3672 0.4440 0.4190 0.1955 0.3169 0.2945 0.2979
## [141] 0.2700 0.2171 0.2665 0.1760 0.3605 0.2957 0.3360 0.2735 0.3693 0.3205
## [151] 0.2542 0.2560 0.4425 0.3158 0.2464 0.3302 0.2124 0.2135 0.2634 0.3410
## [161] 0.3003 0.4080 0.2984 0.2904 0.2254 0.3365 0.3439 0.2585 0.2807 0.2125
## [171] 0.2290 0.2830 0.3001 0.2575 0.2226 0.2189 0.4385 0.3672 0.3570 0.2045
## [181] 0.2990 0.2145 0.4215 0.3436 0.2395 0.3070 0.3459 0.3940 0.4129 0.4735
## [191] 0.2620 0.2670 0.4422 0.4082 0.4077 0.2144 0.2490 0.1985 0.1937 0.1945
## [201] 0.2246 0.2639 0.3420 0.3630 0.2401 0.3850 0.2110 0.3021 0.4098 0.2900
## [211] 0.2815 0.3445 0.1755 0.2074 0.3233 0.2615 0.1975 0.4215 0.2595 0.4295
## [221] 0.3193 0.2795 0.4457 0.3820 0.4042 0.2003 0.4615 0.1649 0.2587 0.2790
## [231] 0.3563 0.2625 0.2592 0.4498 0.3150 0.2130 0.4096 0.2965 0.1875 0.2220
## [241] 0.4055 0.3121 0.4220 0.2672 0.4354 0.3399 0.3735 0.2085 0.2155 0.2525
## [251] 0.2120 0.2660 0.2950 0.3988 0.2725 0.2372 0.3840 0.1800 0.2835 0.3288
## [261] 0.3353
After transformation, the original data and transformed data are plotted side-by-side to show the effect. The above mentioned transformations do not change the shape of the data.
par(mfrow = c(1,2))
cat("\n\nPlot Histogram of weight and Histogram of z.weight ")
##
##
## Plot Histogram of weight and Histogram of z.weight
# Create two histograms
hist(Cars$weight, breaks = 20, xlim = c(1000,5000), main = "Histogram of Weight", col = "lightblue", xlab = "Weight", ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
hist(z.weight, breaks = 20, xlim = c(-2,3), main = "Histogram of Z-score of Weight", col = "pink", xlab = "Z-score of Weight", ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
Kurtosis and skewness are indicators for deviations from normality.
# Install a package if not installed then load it
if(! "moments" %in% installed.packages()) { install.packages("moments", dependencies = TRUE) }
library(moments)
cat("\n\nSkewness and Kurtosis are indicators for the shape of a distribution. ")
##
##
## Skewness and Kurtosis are indicators for the shape of a distribution.
cat("If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.\n ")
## If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.
##
cat("\nSkewness: ", format(skewness(Cars$weight), digits = 4, scientific = FALSE))
##
## Skewness: 0.2686
cat("\nKurtosis: ", format(kurtosis(Cars$weight), digits = 4, scientific = FALSE))
##
## Kurtosis: 2.473
Although inspecting the shape of a data distribution gives a relatively good indication of its characteristics, it is difficult to say for sure whether a dataset shows a normal distribution, especially for small sample sizes. Therefore, a normality test, like Shapirow-Wilk test, can be applied.
cat("\n\nNormal distribution indicator for Cars$weight: ")
##
##
## Normal distribution indicator for Cars$weight:
p.value <- shapiro.test(Cars$weight)$p.value
cat("\np-value: ", format(p.value, digits = 4, scientific = FALSE))
##
## p-value: 0.00000116
cat("\nIf p-value < 0.05, data are not normally distributed.")
##
## If p-value < 0.05, data are not normally distributed.
It is of advantage for modelling applications to have normally distributed data. Under some circumstances, contonuous data can be transformed to approach normality as shown here for Cars$weight.
cat("\n\nOther transformations to potentially reach normality. ")
##
##
## Other transformations to potentially reach normality.
sqrt.weight <- sqrt(Cars$weight) # Square root
cat("If p-value < 0.05, data are not normally distributed.")
## If p-value < 0.05, data are not normally distributed.
cat("\n\np-value of sqrt: ", format(shapiro.test(sqrt.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of sqrt: 0.000000002395
ln.weight <- log(Cars$weight) # Natural log
cat("\n\np-value of ln.weight: ", format(shapiro.test(ln.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of ln.weight: 0.00000000000000000000001447
invsqrt.weight <- 1 / sqrt.weight # Inverse square root
cat("\n\np-value of invsqrt.weight: ", format(shapiro.test(invsqrt.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of invsqrt.weight: 0.00000000000000000000000000000000985
Plotting a histogram with overlaid normal curve might give an indication of normality.
par(mfrow = c(1,1))
x <- rnorm(1000000, mean = mean(invsqrt.weight), s = sd(invsqrt.weight))
cat("\n\nPlot Histogram of invsqrt.weight. ")
##
##
## Plot Histogram of invsqrt.weight.
hist(invsqrt.weight,
breaks = 30,
col = "orange",
prob = TRUE,
border = "black",
xlab = "Inverse Square Root of Weight",
ylab = "Counts",
main = "Histogram of Inverse Square Root of Weight")
box(which = "plot",
lty = "solid",
col = "black")
# Overlay with normal density
lines(density(x), col = "red")
cat("\n\nPlot Q-Q Plot of invsqrt.weight. ")
##
##
## Plot Q-Q Plot of invsqrt.weight.
qqnorm(invsqrt.weight,
dataX = TRUE,
col = "red",
ylim = c(0.02,0.04),
main = "Normal Q-Q Plot of Inverse Square Root of Weight")
qqline(invsqrt.weight,
col = "blue",
datax = FALSE)