Data Exploration
Exploration of the raw data - base and dplyr
# Read sheet 2 of the BMI workbook and check which classes the returned
# object carries.
bmi_1 <- read_excel("bmi.xlsx", sheet = 2)
class(bmi_1)
[1] "tbl_df" "tbl" "data.frame"
- Check the dimensions of bmi:
dim(bmi_1)
[1] 199 2
- View the column names of bmi:
colnames(bmi_1)
[1] "Country" "BMI_1980"
str(bmi_1)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 199 obs. of 2 variables:
$ Country : chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
$ BMI_1980: num 20.4 25.2 23.7 25.7 20.1 ...
# Requires dplyr: library(dplyr); run install.packages("dplyr") if it is missing
glimpse(bmi_1)
Observations: 199
Variables: 2
$ Country <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia"...
$ BMI_1980 <dbl> 20.44348, 25.17427, 23.67764, 25.67324, 20.06763, 24.22235, 23.84436, 25.77727, 23.63058, 23.90340, 26....
summary(bmi_1)
Country BMI_1980
Length:199 Min. :18.47
Class :character 1st Qu.:21.38
Mode :character Median :23.98
Mean :23.55
3rd Qu.:25.40
Max. :28.28
head(bmi_1, n = 10)
tail(bmi_1, n = 10)
Exploration of the raw data - psych
# library(psych), install.packages("psych")
- Check the structure of bmi, the psych way:
describe(bmi_1)
- Check the structure of bmi, the psych way - by group:
# Label every row's 1980 BMI as "Normal" or "Unhealthy", then summarise BMI
# within each label. The if_else() comparison is element-wise, so grouping by
# Country does not change the labels; leaving it out also avoids handing a
# grouped tibble to later steps.
bmi_2 <- bmi_1 %>%
  mutate(Health = if_else(BMI_1980 <= 18.5 | BMI_1980 >= 24.9,
                          "Unhealthy", "Normal"))
# describe.by() is the deprecated psych spelling; describeBy() is the current
# name for the same grouped summary.
describeBy(bmi_2$BMI_1980, group = factor(bmi_2$Health))
Descriptive statistics by group
group: Normal
---------------------------------------------------------------------------------------------
group: Unhealthy
Introduction to Data Wrangling
Infrastructure <- read.csv2("Infrastructure.csv")
- Preview Infrastructure with str():
str(Infrastructure)
'data.frame': 133 obs. of 7 variables:
$ Country : Factor w/ 133 levels "Afghanistan",..: 126 97 26 49 40 125 56 120 43 35 ...
$ ISO3 : Factor w/ 133 levels "AFG","ALB","ALG",..: 127 97 25 49 40 124 57 121 43 35 ...
$ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
$ Ports : int 24 7 15 7 14 14 10 9 13 7 ...
$ Roadway_Coverage: int 6586610 982000 3860800 3320410 951200 394428 1210251 352046 644480 65050 ...
$ Railway_Coverage: int 224792 87157 86000 63974 29640 16454 27182 8699 41981 5083 ...
$ Airports : int 13513 1218 507 346 464 460 175 98 539 83 ...
- Coerce Country and Rank to character:
# Convert the Country and Rank columns to character vectors in place.
Infrastructure[["Country"]] <- as.character(Infrastructure[["Country"]])
Infrastructure[["Rank"]] <- as.character(Infrastructure[["Rank"]])
- Look at Infrastructure once more with str():
str(Infrastructure)
'data.frame': 133 obs. of 7 variables:
$ Country : chr "United States" "Russia" "China" "India" ...
$ ISO3 : Factor w/ 133 levels "AFG","ALB","ALG",..: 127 97 25 49 40 124 57 121 43 35 ...
$ Rank : chr "1" "2" "3" "4" ...
$ Ports : int 24 7 15 7 14 14 10 9 13 7 ...
$ Roadway_Coverage: int 6586610 982000 3860800 3320410 951200 394428 1210251 352046 644480 65050 ...
$ Railway_Coverage: int 224792 87157 86000 63974 29640 16454 27182 8699 41981 5083 ...
$ Airports : int 13513 1218 507 346 464 460 175 98 539 83 ...
Strings
- Load the stringr package:
# library("stringr") install.packages("stringr")
- Trim all leading and trailing whitespace:
# Example names carrying stray leading/trailing spaces; str_trim() strips
# the whitespace from both ends of each element.
name <- c(" Filip ", "Nick ", " Jonathan")
str_trim(name)
[1] "Filip" "Nick" "Jonathan"
- Pad these strings with leading zeros:
# Example IDs of differing lengths; left-pad each one with "0" up to a total
# width of 9 characters.
pad <- c("23485W", "8823453Q", "994Z")
str_pad(pad, width = 9, side = "left", pad = "0")
[1] "00023485W" "08823453Q" "00000994Z"
- Print the first few country names:
head(Manpower$Country)
[1] United States Russia China India France United Kingdom
133 Levels: Afghanistan Albania Algeria Angola Argentina Armenia Australia Austria Azerbaijan Bahrain Bangladesh ... Zimbabwe
- Make the country names all uppercase and save result:
# to states_upper
states_upper <- toupper(Manpower$Country)
head(states_upper)
[1] "UNITED STATES" "RUSSIA" "CHINA" "INDIA" "FRANCE" "UNITED KINGDOM"
- Make the country names all lowercase and save result to states_lower:
states_lower <- tolower(Manpower$Country)
head(states_lower)
[1] "united states" "russia" "china" "india" "france" "united kingdom"
- Look at the head of Infrastructure:
head(Infrastructure)
- Detect all “Republic” in Country:
str_detect(Infrastructure$Country, "Republic")
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[21] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[121] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
- In the Country column, replace “Republic” with “R”…:
Infrastructure$Country <- str_replace(Infrastructure$Country, "Republic", "R")
Valid Values
- Missing values: in R, represented as NA
- May appear in other forms
- #N/A (Excel)
- Single dot (SPSS, SAS)
- Empty string
- Inf - “Infinite value” (indicative of outliers?):
1/0
[1] Inf
1/0 + 1/0
[1] Inf
33333^33333
[1] Inf
- NaN - “Not a number” (rethink a variable?):
0/0
[1] NaN
1/0 - 1/0
[1] NaN
- character: "treatment", "123", "A"
- numeric: 23.44, 120, NaN, Inf
- integer: 4L, 1123L
- factor: factor("Hello"), factor(8)
- logical: TRUE, FALSE, NA
Missing Values
# Build a small example data frame holding several kinds of "missing" entry.
# cbind() first merges the three vectors into a single character matrix, so
# the n_friends values are stored as text ("NaN", "Inf", "2"); only the NA
# entry remains a true missing value.
name <- c("Jerry", "Beth", "Rick", "Morty")
n_friends <- c(NaN, NA, Inf, 2)
status <- c("Listening to human music", "Happy Family", "Garage", "")
social_df <- data.frame(cbind(name, n_friends, status))
- Call is.na() on the full social_df to spot all NAs:
is.na(social_df)
name n_friends status
[1,] FALSE FALSE FALSE
[2,] FALSE TRUE FALSE
[3,] FALSE FALSE FALSE
[4,] FALSE FALSE FALSE
- Use the any() function to ask whether there are any NAs in the data:
any(is.na(social_df))
[1] TRUE
- View a summary() of the dataset:
summary(social_df)
name n_friends status
Beth :1 2 :1 :1
Jerry:1 Inf :1 Garage :1
Morty:1 NaN :1 Happy Family :1
Rick :1 NA's:1 Listening to human music:1
- Call table() on the status column:
table(social_df$status)
Garage Happy Family Listening to human music
1 1 1 1
- Replace all empty strings in status with NA:
social_df$status[social_df$status == ""] <- NA
- Print social_df to the console:
social_df
- Use complete.cases() to see which rows have no missing values:
complete.cases(social_df)
[1] TRUE FALSE TRUE FALSE
- Use na.omit() to remove all rows with any missing values:
na.omit(social_df)
Outliers
# Read the infrastructure data from CSV, then inspect it visually for unusual
# values: a histogram of Ports, a box plot of Airports, and a scatter plot of
# Roadway_Coverage against Railway_Coverage.
Infrastructure <- read.csv2("Infrastructure.csv")
hist(Infrastructure$Ports)
boxplot(Infrastructure$Airports)
plot(Infrastructure$Railway_Coverage, Infrastructure$Roadway_Coverage)
