04. Čištění dat


Harmonogram
▫01. Rekapitulace
▫02. Explorace
▫03. Příprava pro analýzu
▫04. Chybějící a odlehlé hodnoty

Čištění dat
Explorace hrubých dat - base
# Explore the raw BMI data with base R tools (plus dplyr::glimpse()).
# read_excel() lives in readxl, so that package must be attached first.
# install.packages(c("readxl", "dplyr"))
library(readxl)
library(dplyr)

# Data: second sheet of the workbook
bmi_1 <- read_excel("bmi.xlsx", sheet = 2)

# Check the class of bmi_1
class(bmi_1)

# Check the dimensions of bmi_1
dim(bmi_1)

# View the column names of bmi_1
colnames(bmi_1)

# Structure of the data
str(bmi_1)

# Glimpse (dplyr)
glimpse(bmi_1)

# Summary
summary(bmi_1)

# First 10 and last 10 rows
head(bmi_1, n = 10)
tail(bmi_1, n = 10)

Čištění dat
Explorace hrubých dat - psych
# Install psych only when it is missing (avoids reinstalling on every run),
# then attach it
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

# Summarise bmi_1 the psych way
# NOTE: bmi_1 is created in the base-R exploration snippet earlier in this file
describe(bmi_1)

Čištění dat
Explorace hrubých dat - summarytools
# Install summarytools only when it is missing, then attach it.
# String literals must use straight quotes; typographic quotes do not parse.
if (!requireNamespace("summarytools", quietly = TRUE)) {
  install.packages("summarytools")
}
library(summarytools)

# Data
Manpower <- read.csv("Manpower.csv")

# Build a data-frame summary of Manpower and open it with summarytools' view()
view(dfSummary(Manpower))

Čištění dat
Příprava dat pro analýzu
# Data: read.csv2() is used, i.e. a semicolon-separated file with decimal commas
Infrastructure <- read.csv2("Infrastructure.csv")

# Preview Infrastructure with str()
str(Infrastructure)

# Coerce Country to character
Infrastructure$Country <- as.character(Infrastructure$Country)

# Coerce Rank to factor (the original called as.character(), which
# contradicted the stated intent)
Infrastructure$Rank <- as.factor(Infrastructure$Rank)

# Look at Infrastructure once more with str() to confirm the new types
str(Infrastructure)

Čištění dat
Příprava dat pro analýzu – dílčí manipulace se strings
# Install stringr only when it is missing, then attach it
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

# Trim all leading and trailing whitespace
name <- c(" Filip ", "Nick ", " Jonathan")
str_trim(name)

# Pad these strings with leading zeros up to a width of 9 characters
pad <- c("23485W", "8823453Q", "994Z")
str_pad(pad, width = 9, side = "left", pad = "0")

# Print the country names
# NOTE: Manpower is loaded in the summarytools snippet earlier in this file
Manpower$Country

# Make the country names all uppercase and save the result to states_upper
states_upper <- toupper(Manpower$Country)
states_upper

# Make states_upper all lowercase again
states_lower <- tolower(states_upper)
states_lower

Čištění dat
Příprava dat pro analýzu – dílčí manipulace se strings
# Preview the first rows of Infrastructure
head(Infrastructure)

# Flag which Country entries contain "Republic"
str_detect(string = Infrastructure$Country, pattern = "Republic")

# Shorten "Republic" to "R" in the Country column and store the result back
Infrastructure$Country <- str_replace(
  string = Infrastructure$Country,
  pattern = "Republic",
  replacement = "R"
)

Čištění dat
Příprava dat pro analýzu – chybějící data
● character: "treatment", "123", "A"
● numeric: 23.44, 120, NaN, Inf
● integer: 4L, 1123L
● factor: factor("Hello"), factor(8)
● logical: TRUE, FALSE, NA
● Inf - "Infinite value" (indicative of outliers?)
● 1/0
● 1/0 + 1/0
● 33333^33333
● NaN - "Not a number" (rethink a variable?)
● 0/0
● 1/0 - 1/0
● Missing values: in R, represented as NA
● May appear in other forms
● #N/A (Excel)
● Single dot (SPSS, SAS)
● Empty string

Čištění dat
Příprava dat pro analýzu – chybějící data
# Toy data set for demonstrating missing and special values
name <- c("Jerry", "Beth", "Rick", "Morty")
n_friends <- c(NaN, NA, Inf, 2)
status <- c("Listening to human music", "Happy Family", "Garage", "")

# Build the data frame directly from the vectors. Going through cbind() would
# first create a character matrix, turning NaN/Inf into the strings "NaN" and
# "Inf" so that is.na()/complete.cases() no longer treat NaN as missing.
social_df <- data.frame(name, n_friends, status, stringsAsFactors = FALSE)

# Replace all empty strings in status with NA
social_df$status[social_df$status == ""] <- NA

# Print social_df to the console
social_df

# Use complete.cases() to see which rows have no missing values
complete.cases(social_df)

# Use na.omit() to remove all rows with any missing values
na.omit(social_df)

# Call is.na() on the full social_df to spot all NAs
is.na(social_df)

# Use the any() function to ask whether there are any NAs in the data
any(is.na(social_df))

# View a summary() of the dataset
summary(social_df)

# Call table() on the status column
table(social_df$status)

Čištění dat
Odlehlé hodnoty – explorace grafy
# Data: string literals must use straight quotes; typographic quotes do not
# parse in R
Infrastructure <- read.csv2("Infrastructure.csv")

# Histogram of Ports
hist(Infrastructure$Ports)

# Boxplot of Airports
boxplot(Infrastructure$Airports)

# Scatterplot of Railway_Coverage (x) against Roadway_Coverage (y)
plot(Infrastructure$Railway_Coverage, Infrastructure$Roadway_Coverage)