library("tidyr") library("dplyr") # Nactete datove soubory data <- read.csv(file = "flights.csv", header = TRUE) d <- read.csv(file = "carrier_names.csv", header = FALSE) dopravci <- d$V2 names(dopravci) <- d$V1 dopravci cancellation.codes <- c("A" = "carrier", "B" = "weather", "C" = "control", "D" = "security", "N" = "not cancelled") # Prozkoumejte nactena data # Vypiste unikatni kody dopravcu (UniqueCarrier) a jejich cetnosti # Do tabulky data doplnte novy sloupec Carrier s plnym jmenem dopravce (vyuzijte promennou d) # Kod zruseni letu (CancellationCode) "" nahradte kodem "N", ostatni kody ponechte # Do tabulky data doplnte novy sloupec CancellationReason s vysvetlenim zruseni letu (vyuzijte promennou cancellation.codes) # Vyzkousejte nasledujici prikazy, vysvetlete co je jejich vysledkem # Zkousejte si podobne prikazy pro odpovedi na podobne otazky # SELECT df <- select(data, ActualElapsedTime, AirTime, ArrDelay, DepDelay) glimpse(df) df <- select(data, Origin:Cancelled) glimpse(df) df <- select(data, 1:4, 12:21) glimpse(df) df <- select(data, ends_with("Delay")) glimpse(df) # , : c() - # starts_with(): starts with a prefix # ends_with(): ends with a prefix # contains(): contains a literal string # matches(): matches a regular expression # num_range(): a numerical range like x01, x02, x03. # one_of(): variables in character vector. # everything(): all variables. df <- data %>% select( Carrier, ends_with("Num"), starts_with("Cancel")) df <- data %>% select(contains("time"), contains("Delay")) # MUTATE data %>% mutate(ExtraDelay = ArrDelay - DepDelay) %>% select(ExtraDelay) data <- data %>% mutate( ExtraDelay = ArrDelay - DepDelay, TaxiTime = TaxiIn + TaxiOut, GroundTime = ActualElapsedTime - AirTime, GroundTimeRatio = GroundTime / ActualElapsedTime, DistanceKm = 1.852 * Distance, AvgSpeed = Distance / AirTime * 60, ArrMinutes = 60 * (ArrTime %/% 100) + (ArrTime %% 100), DepMinutes = 60 * (DepTime %/% 100) + (DepTime %% 100) ) # FILTER data %>% filter(TaxiTime != GroundTime) data %>% filter(DistanceKm >= 3000) data %>% filter(Carrier %in% (c("JetBlue", "Southwest", "Delta"))) data %>% filter(TaxiTime > AirTime) data %>% filter((DepTime <= 60 * 5) | (ArrTime >= 60 * 22)) data %>% filter((DepDelay > 0) | (ArrDelay < 0)) data %>% filter((CancellationReason == "weather") & (DayOfWeek %in% c(6, 7))) data %>% filter((CancellationCode != "N") & (DepDelay > 0)) data %>% filter(DistanceKm >= 3000) %>% glimpse data %>% filter(Dest == "JFK") %>% nrow # ARRANGE df3 <- data %>% filter(Cancelled == 1, !is.na(DepDelay)) df3 %>% arrange(DepDelay) df3 %>% arrange(CancellationCode) df3 %>% arrange(UniqueCarrier, DepDelay) df3 %>% arrange(UniqueCarrier, desc(DepDelay)) df3 %>% arrange(Dest == 'DFW', DepMinutes < 8*60) %>% arrange(desc(AirTime)) # SUMMARISE df4 <- data %>% summarize( min_dist = min(Distance), max_dist = max(Distance) ) df4 # Aggregate functions defined in R: # min(x) - minimum value of vector x # max(x) - maximum value of vector x # mean(x) - mean value of vector x # median(x) - median value of vector x # quantile(x, p) - pth quantile of vector x # sd(x) - standard deviation of vector x # var(x) - variance of vector x # IQR(x) - Inter Quartile Range (IQR) of vector x data %>% filter(!is.na(ArrDelay)) %>% summarise( earliest = min(ArrDelay), average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay) ) # dplyr has some of its own aggregate functions: # first(x) - The first element of vector x # last(x) - The last element of vector x # nth(x, n) - The nth element of vector x # n() - The number of rows in the data.frame or group of observations that summarise() describes # n_distinct(x) - The number of unique values in vector x data %>% summarise( n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest), dest100 = nth(Dest, 100) ) data %>% filter(Carrier == 'Delta') %>% summarise( n_flights = n(), n_canc = sum(Cancelled), p_canc = n_canc/n_flights * 100, avg_delay = mean(ArrDelay, na.rm = TRUE) ) # GROUP_BY data %>% group_by(Carrier) %>% summarise( n_flights = n(), n_canc = sum(Cancelled), perc_canc = n_canc / n_flights * 100, avg_delay = mean(ArrDelay, na.rm = TRUE) ) %>% arrange(avg_delay, perc_canc) data %>% group_by(Carrier, Month) %>% summarise( n_flights = n(), n_canc = sum(Cancelled), perc_canc = n_canc / n_flights * 100, avg_delay = mean(ArrDelay, na.rm = TRUE) ) %>% arrange(Carrier, Month) %>% print(n = Inf) data %>% group_by(DayOfWeek) %>% summarize(avg_taxi = mean(TaxiIn + TaxiOut, na.rm = TRUE)) %>% arrange(desc(avg_taxi)) # Auto / letadlo ? df <- data %>% select(Dest, UniqueCarrier, DistanceKm, ActualElapsedTime) %>% mutate( RealTime = ActualElapsedTime + 120, AvgSpeed = DistanceKm / RealTime * 60 ) df %>% filter(!is.na(AvgSpeed), AvgSpeed < 130) %>% summarize( n_less = n(), n_dest = n_distinct(Dest), min_dist = min(DistanceKm), max_dist = max(DistanceKm) ) data %>% filter(!is.na(ArrDelay)) %>% group_by(Carrier) %>% summarize(mean_delay = mean(ArrDelay)) %>% mutate(rank = rank(mean_delay)) %>% arrange(rank) data %>% filter(!is.na(ArrDelay), ArrDelay > 0) %>% group_by(Carrier) %>% summarize(mean_delay = mean(ArrDelay)) %>% mutate(rank = rank(mean_delay)) %>% arrange(rank) # DALSI UKOLY # 1. Ktere letadlo leta nejcasteji z Houstonu? Kolikrat? Zvlast z jednotlivych letist. # 2. Kolik letadel leta jen do jedne destinace? # 3. Ktere lety odletaji nejcasteji? Kolikrat a kam? Vyposte prvnich 5. # Vypiste nejcastejsi destinace kazdeho dopravce a pocty letu do techto destinaci, seradte podle nich sestupne. # Vypiste dopravce, ktery do jednotlivych destinaci leta nejcasteji, vcetne poctu, seradte abcedne dle destinace.