Lesson 01
Setting Work
Directory
# If working directory is set, you only need to include filename of the file you wish to open here
cars <- read.csv("cars.csv", stringsAsFactors = FALSE)
# Write data to working directory
To display the whole
dataset, type the dataset name
cat("\n\nShow first parts of car dataset: \n")
##
##
## Show first parts of car dataset:
head(cars) # Display the first few records of a dataset
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 1 14.0 8 350 165 4209 12 1972 US. 17
## 2 31.9 4 89 71 1925 14 1980 Europe. 7
## 3 17.0 8 302 140 3449 11 1971 US. 14
## 4 15.0 8 400 150 3761 10 1971 US. 16
## 5 30.5 4 98 63 2051 17 1978 US. 8
## 6 23.0 8 350 125 3900 17 1980 US. 10
## weightkg ccm
## 1 1909 5735
## 2 873 1458
## 3 1564 4949
## 4 1706 6555
## 5 930 1606
## 6 1769 5735
cat("\n\nShow column names of cars: \n")
##
##
## Show column names of cars:
names(cars) # Display variable names of a data frame, one kind of data in R
## [1] "mpg" "cylinders" "cubicinches" "hp" "weight"
## [6] "time.to.60" "year" "brand" "l.100km" "weightkg"
## [11] "ccm"
# Install vtable only if it is missing, then attach it.
# requireNamespace() tests for this one package directly; scanning the whole
# installed.packages() matrix is slow and also matches values found in
# columns other than the package name.
if (!requireNamespace("vtable", quietly = TRUE)) {
  install.packages("vtable", dependencies = TRUE)
}
library(vtable)
cat("\n\nDescriptive Statistics of Columns in Data Frame:\n")
##
##
## Descriptive Statistics of Columns in Data Frame:
# Summary-statistics table for all columns of cars, produced by vtable::st()
# in its "csv" output mode. summ holds the statistic expressions and
# summ.names the matching column headers, each supplied as a two-element list.
st(
  cars,
  add.median = TRUE,
  out = "csv",
  simple.kable = TRUE,
  col.align = "right",
  align = "right",
  digits = 5,
  title = "Summary Statistics",
  summ = list(
    c(
      "notNA(x)", "mean(x)", "sd(x)", "min(x)", "pctile(x)[25]",
      "median(x)", "pctile(x)[75]", "max(x)", "propNA(x)", "getmode(x)"
    ),
    c("notNA(x)", "mean(x)")
  ),
  summ.names = list(
    c("N", "Mean", "SD", "Min", "P25", "P50", "P75", "Max", "NA", "Mode"),
    c("Count", "Percent")
  )
)
## Variable N Mean SD Min P25 P50 P75 Max
## 1 mpg 261 25.16 33.537 10 16.9 22 29 550
## 2 cylinders 261 5.59 1.7333 3 4 6 8 8
## 3 cubicinches 259 200.92 109.26 68 99.5 156 303 455
## 4 hp 261 106.36 40.5 46 75 95 138 230
## 5 weight 258 3001.1 872.94 19 2245.2 2867.5 3670 4997
## 6 time.to.60 261 15.548 2.9106 8 14 16 17 25
## 7 year 261 1976.8 3.6377 1971 1974 1977 1980 1983
## 8 brand 261
## 9 ... Europe. 48 18.391%
## 10 ... Japan. 51 19.54%
## 11 ... US. 162 62.069%
## 12 l.100km 261 11.383 3.8758 5 8 11 14 24
## 13 weightkg 258 1365.2 387.4 732 1019.8 1300.5 1665 2267
## 14 ccm 259 3292.5 1790.4 1114 1630.5 2556 4965.5 7456
## NA Mode
## 1 0
## 2 0
## 3 0.00766283524904215
## 4 0
## 5 0.0114942528735632
## 6 0
## 7 0
## 8
## 9
## 10
## 11
## 12 0
## 13 0.0114942528735632
## 14 0.00766283524904215
# Show Characteristics of Data Frame
# Render the first rows of cars as a styled HTML table.
# The calls are namespace-qualified and nested so this step does not depend on
# kableExtra (kable_styling) or magrittr (%>%) having been attached earlier.
kableExtra::kable_styling(
  knitr::kable(head(cars), format = "html", caption = "Sample Data from cars"),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
Sample Data from cars
mpg
|
cylinders
|
cubicinches
|
hp
|
weight
|
time.to.60
|
year
|
brand
|
l.100km
|
weightkg
|
ccm
|
14.0
|
8
|
350
|
165
|
4209
|
12
|
1972
|
US.
|
17
|
1909
|
5735
|
31.9
|
4
|
89
|
71
|
1925
|
14
|
1980
|
Europe.
|
7
|
873
|
1458
|
17.0
|
8
|
302
|
140
|
3449
|
11
|
1971
|
US.
|
14
|
1564
|
4949
|
15.0
|
8
|
400
|
150
|
3761
|
10
|
1971
|
US.
|
16
|
1706
|
6555
|
30.5
|
4
|
98
|
63
|
2051
|
17
|
1978
|
US.
|
8
|
930
|
1606
|
23.0
|
8
|
350
|
125
|
3900
|
17
|
1980
|
US.
|
10
|
1769
|
5735
|
# Install summarytools only if it is missing, then attach it.
# requireNamespace() checks this single package without building the full
# installed.packages() matrix.
if (!requireNamespace("summarytools", quietly = TRUE)) {
  install.packages("summarytools", dependencies = TRUE)
}
library(summarytools)
# Generate a nice Summary
print(dfSummary(cars), method = 'render')
Data Frame Summary
cars
Dimensions: 261 x 11
Duplicates: 0
Generated by summarytools 1.0.1 (R version 4.4.1)
2024-11-18
Create Matrices
# Create a matrix with three rows, two columns, and every value equal to 0.0
mat <- matrix(0.0, nrow = 3, ncol = 2);
cat("\n\nMatrix with 3 rows and 2 columns: \n")
##
##
## Matrix with 3 rows and 2 columns:
mat
## [,1] [,2]
## [1,] 0 0
## [2,] 0 0
## [3,] 0 0
colnames(mat) <- c("Var1", "Var2") # Give a matrix variable names
cat("\n\nVariable names of the matrix: \n")
##
##
## Variable names of the matrix:
colnames(mat) # Display variable names of a matrix
## [1] "Var1" "Var2"
Subset data and
declare new variables
cars.rsub <- cars[1:5,] # Subset the data by rows
cat("\n\nFirst 5 rows of cars: \n")
##
##
## First 5 rows of cars:
cars.rsub
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 1 14.0 8 350 165 4209 12 1972 US. 17
## 2 31.9 4 89 71 1925 14 1980 Europe. 7
## 3 17.0 8 302 140 3449 11 1971 US. 14
## 4 15.0 8 400 150 3761 10 1971 US. 16
## 5 30.5 4 98 63 2051 17 1978 US. 8
## weightkg ccm
## 1 1909 5735
## 2 873 1458
## 3 1564 4949
## 4 1706 6555
## 5 930 1606
cars.rcsub <- cars[c(1,3,5), c(2,4)] # Subset by specific rows and columns
cat("\n\nRows 1, 3 and 5 of columns 2 and 4 of cars: \n")
##
##
## Rows 1, 3 and 5 of columns 2 and 4 of cars:
cars.rcsub
## cylinders hp
## 1 8 165
## 3 8 140
## 5 4 63
cars.vsub <- cars[which(cars$mpg > 40),] # Subset by a logical condition
cat("\n\nAll datasets with mpg > 40 of cars: \n")
##
##
## All datasets with mpg > 40 of cars:
cars.vsub
## mpg cylinders cubicinches hp weight time.to.60 year brand l.100km
## 20 550.0 4 107 90 2430 15 1971 Europe. 10
## 64 46.6 4 86 65 2110 18 1981 Japan. 5
## 107 43.4 4 90 48 2335 24 1981 Europe. 5
## 196 41.5 4 98 76 2144 15 1981 Europe. 6
## 198 43.1 4 90 48 1985 22 1979 Europe. 5
## 207 40.8 4 85 65 2110 19 1981 Japan. 6
## 236 44.0 4 97 52 2130 25 1983 Europe. 5
## 248 44.3 4 90 48 2085 22 1981 Europe. 5
## weightkg ccm
## 20 1102 1753
## 64 957 1409
## 107 1059 1475
## 196 973 1606
## 198 900 1475
## 207 957 1393
## 236 966 1590
## 248 946 1475
# To declare new variables, type the variable name, a left-arrow, then the value of the variable
# Select the rows with mpg above 50 once, then pull both columns from them.
high_mpg_rows <- cars[which(cars$mpg > 50), ]
weight <- high_mpg_rows$weight
mpg <- high_mpg_rows$mpg
cat("\nMiles/Gallon:", mpg)
##
## Miles/Gallon: 550
cat("\nWeight:", weight)
##
## Weight: 2430
# Display more than one figure at a time
par(mfrow = c(1,1)) # plots one figure; the default setting
par(mfrow = c(2,3)) # plots six figures: three in the top row, three in the bottom row
# Plots will fill the plot space row by row
Lesson 2
Deal with Missing
Data
# Install zoo only if it is missing, then attach it.
# requireNamespace() checks this single package without building the full
# installed.packages() matrix.
if (!requireNamespace("zoo", quietly = TRUE)) {
  install.packages("zoo", dependencies = TRUE)
}
library(zoo)
# Look at four variables from cars
cars.4var <- cars[,c(1, 3, 4, 8)]
cat("\n\nHead of cars with 4 columns: \n")
##
##
## Head of cars with 4 columns:
head(cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 89 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 US.
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Make certain entries missing
cars.4var[2, 2] <- cars.4var[4, 4] <- NA
cat("\n\nHead of cars with 4 columns and NA at 2,2 and 4,4: \n")
##
##
## Head of cars with 4 columns and NA at 2,2 and 4,4:
head(cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 NA 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 <NA>
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Replace missing values with constants
cars.4var[2,2] <- 0
cars.4var[4,4] <- "Missing"
cat("\n\nHead of cars with 4 columns and missing values replaced with '0' and 'Missing': \n")
##
##
## Head of cars with 4 columns and missing values replaced with '0' and 'Missing':
head(cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350 165 US.
## 2 31.9 0 71 Europe.
## 3 17.0 302 140 US.
## 4 15.0 400 150 Missing
## 5 30.5 98 63 US.
## 6 23.0 350 125 US.
# Replace NA with mean
# Cell [2, 2] holds a placeholder at this point, not a real measurement, so
# row 2 is left out of the average; otherwise the placeholder itself would be
# averaged in and bias the imputed value.
cars.4var[2, 2] <- mean(cars.4var$cubicinches[-2], na.rm = TRUE)
cat("\n\nHead of cars with 2,2 replaced by mean: \n")
##
##
## Head of cars with 2,2 replaced by mean:
head(cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350.0 165 US.
## 2 31.9 200.6 71 Europe.
## 3 17.0 302.0 140 US.
## 4 15.0 400.0 150 Missing
## 5 30.5 98.0 63 US.
## 6 23.0 350.0 125 US.
# Replace NA with mode
# Tabulate brand without row 4 so the placeholder in the cell being imputed is
# not counted. which.max() yields exactly one level even when several brands
# tie for the highest count, so the single-cell assignment below stays valid.
our_table <- table(cars.4var$brand[-4])
our_mode <- names(our_table)[which.max(our_table)]
cars.4var[4, 4] <- our_mode
cat("\n\nHead of cars with 4,4 replaced by mode: \n")
##
##
## Head of cars with 4,4 replaced by mode:
head(cars.4var)
## mpg cubicinches hp brand
## 1 14.0 350.0 165 US.
## 2 31.9 200.6 71 Europe.
## 3 17.0 302.0 140 US.
## 4 15.0 400.0 150 US.
## 5 30.5 98.0 63 US.
## 6 23.0 350.0 125 US.
# Replace every NA in weight with overall mean
cars$weight <- na.aggregate(cars$weight)
# Replace NA and empty with mean in weightkg
meanVal <- mean(cars$weightkg, na.rm = TRUE)
cars$weightkg[is.na(cars$weightkg)] <- meanVal
# Write data to working directory
# No setwd() call: a hard-coded, user-specific path fails on every other
# machine. The file is written relative to the current working directory.
write.csv(cars, file = "carsFixed.csv")
Deal with
Outliers
# graphics is one of R's base packages: it ships with R and is not installable
# from CRAN, so no install step is attempted. It is attached by default;
# library() is kept only to make the dependency explicit.
library(graphics)
# Display boxplot
cat("\n\nCHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.")
##
##
## CHL is a dataset of four virtual teams competing in delivering packages to imaginary customers.
cat("\n\nShow boxplot to identify potential outliers of dataset CHL: \n")
##
##
## Show boxplot to identify potential outliers of dataset CHL:
# Box plot of delivery time per team; box widths reflect group sizes (varwidth).
boxplot(
  Time.in.min ~ Team,
  data = CHL,
  col = c("lightgreen", "orange", "pink", "yellow"),
  main = "Delivery Time by Team",
  xlab = "Team",
  ylab = "Delivery time",
  ylim = c(0, 50),
  varwidth = TRUE
)
# Add grid
# Horizontal grid lines only (nx = NA); the line type is passed by name.
grid(nx = NA, ny = NULL, lty = 5, lwd = 1, col = "lightgrey")
# Show Descriptive Stats
cat("\n\nShow descriptive stats to identify potential outliers and missing values in dataset CHL: \n")
##
##
## Show descriptive stats to identify potential outliers and missing values in dataset CHL:
summary(CHL$Time.in.min)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.30 5.50 7.50 8.06 9.50 42.00 3
Create Histogram and
Scatter Plot
# Set up the plot area
par(mfrow = c(1,1))
cat("\n\nDisplay histogram of cars dataset to identify potential outliers: \n")
##
##
## Display histogram of cars dataset to identify potential outliers:
# Create the histogram bars
hist(cars$weight,
breaks = 30,
xlim = c(0,5000),
xlab = "Weight",
ylim = c(0,40),
ylab = "Counts",
col = "lightblue",
border = "black",
main = "Histogram of car weights")
# Make a box around the plot
box(which = "plot", lty = "solid", col = "black")
# Set up the plot area
par(mfrow = c(1,1))
cat("\n\nDisplay scatter plot of weight against MPG of cars dataset to identify potential outliers: \n")
##
##
## Display scatter plot of weight against MPG of cars dataset to identify potential outliers:
# Create the scatter plot
# Filled light-blue points (pch = 16). No border argument is passed: it is not
# a graphical parameter for scatter plots and would only raise warnings.
plot(cars$weight, cars$mpg,
     xlim = c(0, 5000),
     xlab = "Weight",
     ylim = c(0, 600),
     ylab = "MPG",
     type = "p",
     pch = 16,
     col = "lightblue",
     main = "Scatter Plot of MPG by Weight")
# Add open black circles
# Drawn on top of the filled points to give each one a black outline.
points(cars$weight, cars$mpg, type = "p", col = "black")
Plot side-by-side
histograms
par(mfrow = c(1,2))
cat("\n\nPlot Histogram of weight and Histogram of z.weight ")
##
##
## Plot Histogram of weight and Histogram of z.weight
# Create two histograms
# z.weight is the z-score of weight: (value - mean) / standard deviation.
# It is computed here so the second histogram does not rely on a definition
# made elsewhere. NOTE(review): confirm this matches any earlier z.weight.
z.weight <- (cars$weight - mean(cars$weight, na.rm = TRUE)) /
  sd(cars$weight, na.rm = TRUE)
hist(cars$weight, breaks = 20, xlim = c(1000, 5000),
     main = "Histogram of Weight", col = "lightblue",
     xlab = "Weight", ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
hist(z.weight, breaks = 20, xlim = c(-2, 3),
     main = "Histogram of Z-score of Weight", col = "pink",
     xlab = "Z-score of Weight", ylab = "Counts")
box(which = "plot", lty = "solid", col = "black")
Skewness +
Kurtosis
# Install a package if not installed then load it
if(! "moments" %in% installed.packages()) { install.packages("moments", dependencies = TRUE) }
library(moments)
cat("\n\nSkewness and Kurtosis are indicators for the shape of a distribution. ")
##
##
## Skewness and Kurtosis are indicators for the shape of a distribution.
cat("If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.\n ")
## If Skewness and Kurtosis between -2 and 2, normality of data can be assumed.
##
cat("\nSkewness: ", format(skewness(cars$weight), digits = 4, scientific = FALSE))
##
## Skewness: 0.2686
# moments::kurtosis() returns Pearson's kurtosis, which equals 3 for a normal
# distribution. Subtract 3 to report excess kurtosis, the quantity that the
# "between -2 and 2" rule of thumb is stated for.
cat("\nExcess Kurtosis: ", format(kurtosis(cars$weight) - 3, digits = 4, scientific = FALSE))
##
## Kurtosis: 2.473
Transformations to
reach Normality
cat("\n\nOther transformations to potentially reach normality. ")
##
##
## Other transformations to potentially reach normality.
sqrt.weight <- sqrt(cars$weight) # Square root
cat("If p-value < 0.05, data are not normally distributed.")
## If p-value < 0.05, data are not normally distributed.
cat("\n\np-value of sqrt: ", format(shapiro.test(sqrt.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of sqrt: 0.000000002395
ln.weight <- log(cars$weight) # Natural log
cat("\n\np-value of ln.weight: ", format(shapiro.test(ln.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of ln.weight: 0.00000000000000000000001447
invsqrt.weight <- 1 / sqrt.weight # Inverse square root
cat("\n\np-value of invsqrt.weight: ", format(shapiro.test(invsqrt.weight)$p.value, digits = 4, scientific = FALSE))
##
##
## p-value of invsqrt.weight: 0.00000000000000000000000000000000985
Histogram with Normal
Distribution
# Single-figure layout.
par(mfrow = c(1, 1))
# Simulated normal sample with the same mean and standard deviation as
# invsqrt.weight. The argument is spelled out as sd = instead of relying on
# partial matching of an abbreviated name.
x <- rnorm(1000000, mean = mean(invsqrt.weight), sd = sd(invsqrt.weight))
cat("\n\nPlot Histogram of invsqrt.weight. ")
##
##
## Plot Histogram of invsqrt.weight.
# Density-scaled histogram (freq = FALSE) so the bars are on the same scale as
# the density curve drawn over them; the y axis is therefore a density, not a
# count, and is labelled accordingly.
hist(invsqrt.weight,
     breaks = 30,
     col = "orange",
     freq = FALSE,
     border = "black",
     xlab = "Inverse Square Root of Weight",
     ylab = "Density",
     main = "Histogram of Inverse Square Root of Weight")
box(which = "plot",
    lty = "solid",
    col = "black")
# Overlay with normal density
lines(density(x), col = "red")
Normal Q-Q Plot
cat("\n\nPlot Q-Q Plot of invsqrt.weight. ")
##
##
## Plot Q-Q Plot of invsqrt.weight.
# Normal Q-Q plot with the sample quantiles on the y axis (datax = FALSE), so
# ylim applies to the data values. The argument name is case-sensitive: a
# misspelt variant is not recognised by qqnorm() and only triggers a
# "not a graphical parameter" warning.
qqnorm(invsqrt.weight,
       datax = FALSE,
       col = "red",
       ylim = c(0.02, 0.04),
       main = "Normal Q-Q Plot of Inverse Square Root of Weight")
# Reference line drawn with the same axis orientation as the points above.
qqline(invsqrt.weight,
       col = "blue",
       datax = FALSE)
Create indicator
Variables
# Ten example regions: the four compass regions twice, then north and south.
region <- c(rep(c("north", "south", "east", "west"), 2), "north", "south")
# Change region variables to indicators
# Vectorised comparison replaces the element-by-element loop: each flag is 1
# where region equals that value and 0 otherwise. "west" has no flag of its
# own; it is the case where all three flags are 0.
north_flag <- as.numeric(region == "north")
south_flag <- as.numeric(region == "south")
east_flag <- as.numeric(region == "east")
# Print the regions and the three indicator vectors.
region
north_flag
south_flag
east_flag
## [1] "north" "south" "east" "west" "north" "south" "east" "west" "north"
## [10] "south"
## [1] 1 0 0 0 1 0 0 0 1 0
## [1] 0 1 0 0 0 1 0 0 0 1
## [1] 0 0 1 0 0 0 1 0 0 0
Find Duplicated
Records
# anyDuplicated() returns the index of the first duplicated record (0 if there are none); for the number of duplicates use sum(duplicated(x))
anyDuplicated(cars)
## [1] 0
# To examine each record, use duplicated (TRUE/FALSE per record)
duplicated(cars)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Let's duplicate the first record
new.cars <- rbind(cars, cars[1,])
# Check for duplicates
anyDuplicated(new.cars)
## [1] 262
duplicated(new.cars)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE