Outline


    01. dplyr, tbl

    02. select, mutate

    03. filter, arrange

    04. summarise, %>%

    05. group_by


    Packages


    library("dplyr")
    library("tidyr")
    library("stringr")

    dplyr and tibble


    • Data:
    Titanic = read.csv("Train.csv")


    • Object class:
    class(Titanic)
    [1] "data.frame"


    • as tibble:
    Titanic = as_tibble(Titanic)


    • Data structure:
    glimpse(Titanic, width = 50)
    Observations: 891
    Variables: 12
    $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9...
    $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1...
    $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3...
    $ Name        <fct> "Braund, Mr. Owen Harris"...
    $ Sex         <fct> male, female, female, fem...
    $ Age         <dbl> 22, 38, 26, 35, 35, NA, 5...
    $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0...
    $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2...
    $ Ticket      <fct> A/5 21171, PC 17599, STON...
    $ Fare        <dbl> 7.2500, 71.2833, 7.9250, ...
    $ Cabin       <fct> , C85, , C123, , , E46, ,...
    $ Embarked    <fct> S, C, S, S, S, Q, S, S, S...


    • Variable naming:
    # Different variable name
    Titanic <- rename(Titanic, # data frame
                      Departure_Point = Embarked) # new name = old name
    # Dealing with missing values - dplyr way
    Titanic <- mutate(Titanic, Departure_Point = na_if(Departure_Point, ""))
    # Different levels
    Titanic$Departure_Point <- recode(Titanic2$Departure_Point, 
                                               "C" = "Cherbourg", 
                                               "Q" = "Queenstown", 
                                               "S" = "Southampton")
    # Take a look at the data
    summary(Titanic$Departure_Point)
                  Cherbourg  Queenstown Southampton        NA's 
              0         168          77         644           2 

    select

    Select/rename variables by name


    • Single variable selection:
    head(select(Titanic, Name))


    • Select multiple variables by their order:
    head(select(Titanic, PassengerId:Survived, Name:Age))


    • Select multiple variables based on their wording:
    head(select(Titanic, contains("e")))


    • Select multiple variables - combination of the previous steps
    head(select(Titanic, 1:2, 
                         contains("t")))

    mutate

    Create or transform variables


    • Create a new variable:
    Titanic_2018 = mutate(Titanic, Age_2018 = (Age + (2018-1912)))
    Titanic_2018


    • Create more variables at the same time:
    Titanic_02 = mutate(Titanic, Price_Year_Ratio = (Fare / Age), 
                                 Family = SibSp + Parch)
    Titanic_02


    filter

    Return rows with matching conditions


    • Filter rows based on logical operators:
    filter(Titanic, Age < 18)
    filter(Titanic, Departure_Point %in% c("Southampton", "Queenstown"))


    • Multiple logical filters:
    filter(Titanic, Age < 18, 
                    Survived == 1)
    filter(Titanic, Age < 18, 
                    Pclass == 1 | Pclass == 2)


    • Filtering a string:
    library(stringr)
    filter(Titanic, str_detect(Name, "Rose"))

    arrange

    Arrange rows by variables


    • Arrange (order) survivors by their age. Omit missing values:
    Survived = filter(Titanic, Survived == 1, !is.na(Age))
    head(arrange(Survived, Age))


    • Arrange (order) survivors by their age - from the oldest to the youngest. Omit missing values:
    head(arrange(Survived, desc(Age)))


    • Arrange (order) survivors by their age and the port of their embarkement:
    head(arrange(Survived, desc(Age), Departure_Point))

    summarize

    Reduce multiple values down to a single value


    • The lowest ticket price, the oldest passenger:
    summarize(Titanic, Fare_Min = min(Fare), Age_Max = max(Age))


    • The average age of the survivors:
    summarize(filter(Titanic, Survived == 1), Age_Avg = mean(Age, na.rm = TRUE))


    • The average ticket price, its standard deviation, median, min and max:
    Titanic_03 = filter(Titanic, !is.na(Fare))
    summarize(Titanic_03, Mean_Fare = mean(Fare), 
                          SD_Fare = sd(Fare), 
                          Median_Fare = median(Fare), 
                          Min_Fare = min(Fare), 
                          Max_Fare = max(Fare))


    %>%

    Control of the code flow


    • The average age of the passengers travelling in the third class and did not survive:
    Titanic %>%
            filter(Pclass == 3, 
                   Survived == 0) %>%
            summarize(Age_Died_Avg = mean(Age, na.rm = TRUE))


    • The highest ticket price in the first for passenger by name ‘William’ who survived:
    Titanic %>%
            filter(Pclass == 1,
                   str_detect(Name, "William"),
                   Survived == 1) %>%
            summarize(Fare_Avg_Will = min(Fare))

    group_by

    Group by one or more variables


    • The average age of the passengers in the third class who did not survive:
    Titanic %>%
              group_by(Sex) %>%
              summarize(Mean_Fare = mean(Fare, na.rm = TRUE), 
                        Count = n(), 
                        Mean_Age = mean(Age, na.rm = TRUE))
    LS0tDQp0aXRsZTogIioqMDUuIERhdGEgV3JhbmdsaW5nKioiDQpzdWJ0aXRsZTogIlIxMDEiDQphdXRob3I6ICJWw610IEdhYnJoZWwiDQpvdXRwdXQ6IA0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogdHJ1ZQ0KICAgIHRvY19mbG9hdDogdHJ1ZQ0KICAgIHRoZW1lOiB5ZXRpDQogICAgY29kZV9mb2xkaW5nOiAic2hvdyINCi0tLQ0KIyMgT3V0bGluZQ0KPGJyPg0KDQo8dWw+IA0KIyMjIyAwMS4gKipkcGx5cioqLCAqKnRibCoqDQojIyMjIDAyLiAgKipzZWxlY3QqKiwgKiptdXRhdGUqKg0KIyMjIyAwMy4gICoqZmlsdGVyKiosICoqYXJyYW5nZSoqDQojIyMjIDA0LiAgKipzdW1tYXJpc2UqKiwgKiolPiUqKg0KIyMjIyAwNS4gICoqZ3JvdXBfYnkqKg0KPHVsLz4NCjxicj4NCg0KIyMgUGFja2FnZXMNCjxicj4NCg0KYGBge3J9DQpsaWJyYXJ5KCJkcGx5ciIpDQpsaWJyYXJ5KCJ0aWR5ciIpDQpsaWJyYXJ5KCJzdHJpbmdyIikNCmBgYA0KDQoNCg0KIyMgW2RwbHlyXShodHRwczovL2RwbHlyLnRpZHl2ZXJzZS5vcmcvKSBhbmQgW3RpYmJsZV0oaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL3RpYmJsZS92aWduZXR0ZXMvdGliYmxlLmh0bWwpDQo8YnI+DQoNCiogKipEYXRhKio6DQpgYGB7cn0NClRpdGFuaWMgPSByZWFkLmNzdigiVHJhaW4uY3N2IikNCmBgYA0KPGJyPg0KDQoqICoqT2JqZWN0IGNsYXNzKio6DQpgYGB7cn0NCmNsYXNzKFRpdGFuaWMpDQoNCmBgYA0KPGJyPg0KDQoqIGFzICoqdGliYmxlKio6DQpgYGB7cn0NClRpdGFuaWMgPSBhc190aWJibGUoVGl0YW5pYykNCmBgYA0KPGJyPg0KDQoqIERhdGEgKipzdHJ1Y3R1cmUqKjoNCmBgYHtyLCBzaXplPSAidGlueSJ9DQpnbGltcHNlKFRpdGFuaWMsIHdpZHRoID0gNTApDQpgYGANCjxicj4NCg0KKiBWYXJpYWJsZSAqKm5hbWluZyoqOg0KYGBge3J9DQojIERpZmZlcmVudCB2YXJpYWJsZSBuYW1lDQpUaXRhbmljIDwtIHJlbmFtZShUaXRhbmljLCAjIGRhdGEgZnJhbWUNCiAgICAgICAgICAgICAgICAgIERlcGFydHVyZV9Qb2ludCA9IEVtYmFya2VkKSAjIG5ldyBuYW1lID0gb2xkIG5hbWUNCg0KIyBEZWFsaW5nIHdpdGggbWlzc2luZyB2YWx1ZXMgLSBkcGx5ciB3YXkNClRpdGFuaWMgPC0gbXV0YXRlKFRpdGFuaWMsIERlcGFydHVyZV9Qb2ludCA9IG5hX2lmKERlcGFydHVyZV9Qb2ludCwgIiIpKQ0KDQojIERpZmZlcmVudCBsZXZlbHMNClRpdGFuaWMkRGVwYXJ0dXJlX1BvaW50IDwtIHJlY29kZShUaXRhbmljMiREZXBhcnR1cmVfUG9pbnQsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJDIiA9ICJDaGVyYm91cmciLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiUSIgPSAiUXVlZW5zdG93biIsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJTIiA9ICJTb3V0aGFtcHRvbiIpDQoNCiMgVGFrZSBhIGxvb2sgYXQgdGhlIGRhdGENCnN1bW1hcnkoVGl0YW5pYyREZXBhcnR1cmVfUG9pbnQpDQpgYGANCg0KIyMgW3NlbGVjdF0oaHR0cHM6Ly9kcGx5ci50aWR5dmVyc2Uub3JnL3JlZmVyZW5jZS9zZWxlY3QuaHRtbCkNCiMjIyAqU2VsZWN0L3JlbmFtZSB2YXJpYWJsZXMgYnkgbmFtZSoNCjxicj4NCg0KKiBTaW5nbGUgdmFyaWFibGUgc2VsZWN0aW9uOg0KYGBge3J9DQpoZWFkKHNlbGVjdChUaXRhbmljLCBOYW1lKSkNCmBgYA0KPGJyPg0KDQoqIFNlbGVjdCBtdWx0aXBsZSB2YXJpYWJsZXMgYnkgdGhlaXIgb3JkZXI6DQpgYGB7cn0NCmhlYWQoc2VsZWN0KFRpdGFuaWMsIFBhc3NlbmdlcklkOlN1cnZpdmVkLCBOYW1lOkFnZSkpDQpgYGANCjxicj4NCg0KKiBTZWxlY3QgbXVsdGlwbGUgdmFyaWFibGVzIGJhc2VkIG9uIHRoZWlyIHdvcmRpbmc6DQpgYGB7cn0NCmhlYWQoc2VsZWN0KFRpdGFuaWMsIGNvbnRhaW5zKCJlIikpKQ0KYGBgDQo8YnI+DQoNCiogU2VsZWN0IG11bHRpcGxlIHZhcmlhYmxlcyAtIGNvbWJpbmF0aW9uIG9mIHRoZSBwcmV2aW91cyBzdGVwcw0KYGBge3J9DQpoZWFkKHNlbGVjdChUaXRhbmljLCAxOjIsIA0KICAgICAgICAgICAgICAgICAgICAgY29udGFpbnMoInQiKSkpDQpgYGANCiMjIFttdXRhdGVdKGh0dHBzOi8vZHBseXIudGlkeXZlcnNlLm9yZy9yZWZlcmVuY2UvbXV0YXRlLmh0bWwpDQojIyMgKkNyZWF0ZSBvciB0cmFuc2Zvcm0gdmFyaWFibGVzKg0KPGJyPg0KDQoqICoqQ3JlYXRlKiogYSBuZXcgKip2YXJpYWJsZSoqOg0KYGBge3J9DQpUaXRhbmljXzIwMTggPSBtdXRhdGUoVGl0YW5pYywgQWdlXzIwMTggPSAoQWdlICsgKDIwMTgtMTkxMikpKQ0KVGl0YW5pY18yMDE4DQpgYGANCjxicj4NCg0KKiAqKkNyZWF0ZSBtb3JlIHZhcmlhYmxlcyoqIGF0IHRoZSBzYW1lIHRpbWU6DQpgYGB7cn0NClRpdGFuaWNfMDIgPSBtdXRhdGUoVGl0YW5pYywgUHJpY2VfWWVhcl9SYXRpbyA9IChGYXJlIC8gQWdlKSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgIEZhbWlseSA9IFNpYlNwICsgUGFyY2gpDQpUaXRhbmljXzAyDQpgYGANCjxicj4NCg0KIyMgW2ZpbHRlcl0oaHR0cHM6Ly9kcGx5ci50aWR5dmVyc2Uub3JnL3JlZmVyZW5jZS9maWx0ZXIuaHRtbCkNCiMjIyAqUmV0dXJuIHJvd3Mgd2l0aCBtYXRjaGluZyBjb25kaXRpb25zKg0KPGJyPg0KDQoqICoqRmlsdGVyIHJvd3MqKiBiYXNlZCBvbiAqKmxvZ2ljYWwgb3BlcmF0b3JzKio6DQpgYGB7cn0NCmZpbHRlcihUaXRhbmljLCBBZ2UgPCAxOCkNCg0KZmlsdGVyKFRpdGFuaWMsIERlcGFydHVyZV9Qb2ludCAlaW4lIGMoIlNvdXRoYW1wdG9uIiwgIlF1ZWVuc3Rvd24iKSkNCmBgYA0KPGJyPg0KDQoqICoqTXVsdGlwbGUqKiBsb2dpY2FsIGZpbHRlcnM6DQpgYGB7cn0NCmZpbHRlcihUaXRhbmljLCBBZ2UgPCAxOCwgDQogICAgICAgICAgICAgICAgU3Vydml2ZWQgPT0gMSkNCg0KZmlsdGVyKFRpdGFuaWMsIEFnZSA8IDE4LCANCiAgICAgICAgICAgICAgICBQY2xhc3MgPT0gMSB8IFBjbGFzcyA9PSAyKQ0KYGBgDQo8YnI+DQoNCiogRmlsdGVyaW5nIGEgKipzdHJpbmcqKjoNCmBgYHtyfQ0KbGlicmFyeShzdHJpbmdyKQ0KDQpmaWx0ZXIoVGl0YW5pYywgc3RyX2RldGVjdChOYW1lLCAiUm9zZSIpKQ0KYGBgDQoNCiMjIFthcnJhbmdlXShodHRwczovL2RwbHlyLnRpZHl2ZXJzZS5vcmcvcmVmZXJlbmNlL2FycmFuZ2UuaHRtbCkNCiMjIyAqQXJyYW5nZSByb3dzIGJ5IHZhcmlhYmxlcyoNCjxicj4NCg0KKiBBcnJhbmdlIChvcmRlcikgc3Vydml2b3JzIGJ5IHRoZWlyIGFnZS4gT21pdCBtaXNzaW5nIHZhbHVlczoNCmBgYHtyfQ0KU3Vydml2ZWQgPSBmaWx0ZXIoVGl0YW5pYywgU3Vydml2ZWQgPT0gMSwgIWlzLm5hKEFnZSkpDQoNCmhlYWQoYXJyYW5nZShTdXJ2aXZlZCwgQWdlKSkNCmBgYA0KPGJyPg0KDQoqIEFycmFuZ2UgKG9yZGVyKSBzdXJ2aXZvcnMgYnkgdGhlaXIgYWdlIC0gZnJvbSB0aGUgb2xkZXN0IHRvIHRoZSB5b3VuZ2VzdC4gT21pdCBtaXNzaW5nIHZhbHVlczoNCmBgYHtyfQ0KaGVhZChhcnJhbmdlKFN1cnZpdmVkLCBkZXNjKEFnZSkpKQ0KYGBgDQo8YnI+DQoNCiogQXJyYW5nZSAob3JkZXIpIHN1cnZpdm9ycyBieSB0aGVpciBhZ2UgYW5kIHRoZSBwb3J0IG9mIHRoZWlyIGVtYmFya2VtZW50Og0KYGBge3J9DQpoZWFkKGFycmFuZ2UoU3Vydml2ZWQsIGRlc2MoQWdlKSwgRGVwYXJ0dXJlX1BvaW50KSkNCmBgYA0KDQojIyBbc3VtbWFyaXplXShodHRwczovL2RwbHlyLnRpZHl2ZXJzZS5vcmcvcmVmZXJlbmNlL3N1bW1hcml6ZS5odG1sKQ0KIyMjICpSZWR1Y2UgbXVsdGlwbGUgdmFsdWVzIGRvd24gdG8gYSBzaW5nbGUgdmFsdWUqDQo8YnI+DQoNCiogVGhlICoqbG93ZXN0KiogdGlja2V0IHByaWNlLCB0aGUgKipvbGRlc3QqKiBwYXNzZW5nZXI6DQpgYGB7cn0NCnN1bW1hcml6ZShUaXRhbmljLCBGYXJlX01pbiA9IG1pbihGYXJlKSwgQWdlX01heCA9IG1heChBZ2UpKQ0KYGBgDQo8YnI+DQoNCiogVGhlICoqYXZlcmFnZSoqIGFnZSBvZiB0aGUgc3Vydml2b3JzOg0KYGBge3J9DQpzdW1tYXJpemUoZmlsdGVyKFRpdGFuaWMsIFN1cnZpdmVkID09IDEpLCBBZ2VfQXZnID0gbWVhbihBZ2UsIG5hLnJtID0gVFJVRSkpDQpgYGANCjxicj4NCg0KKiBUaGUgKiphdmVyYWdlKiogdGlja2V0IHByaWNlLCBpdHMgKipzdGFuZGFyZCBkZXZpYXRpb24qKiwgKiptZWRpYW4qKiwgKiptaW4qKiBhbmQgKiptYXgqKjoNCmBgYHtyfQ0KVGl0YW5pY18wMyA9IGZpbHRlcihUaXRhbmljLCAhaXMubmEoRmFyZSkpDQoNCnN1bW1hcml6ZShUaXRhbmljXzAzLCBNZWFuX0ZhcmUgPSBtZWFuKEZhcmUpLCANCiAgICAgICAgICAgICAgICAgICAgICBTRF9GYXJlID0gc2QoRmFyZSksIA0KICAgICAgICAgICAgICAgICAgICAgIE1lZGlhbl9GYXJlID0gbWVkaWFuKEZhcmUpLCANCiAgICAgICAgICAgICAgICAgICAgICBNaW5fRmFyZSA9IG1pbihGYXJlKSwgDQogICAgICAgICAgICAgICAgICAgICAgTWF4X0ZhcmUgPSBtYXgoRmFyZSkpDQpgYGANCjxicj4NCg0KIyMgWyU+JV0oaHR0cHM6Ly93d3cuZGF0YWNhbXAuY29tL2NvbW11bml0eS90dXRvcmlhbHMvcGlwZS1yLXR1dG9yaWFsKQ0KIyMjICpDb250cm9sIG9mIHRoZSBjb2RlIGZsb3cqDQo8YnI+DQoNCiogVGhlIGF2ZXJhZ2UgYWdlIG9mIHRoZSBwYXNzZW5nZXJzIHRyYXZlbGxpbmcgaW4gdGhlIHRoaXJkIGNsYXNzIGFuZCBkaWQgbm90IHN1cnZpdmU6DQpgYGB7cn0NClRpdGFuaWMgJT4lDQogICAgICAgIGZpbHRlcihQY2xhc3MgPT0gMywgDQogICAgICAgICAgICAgICBTdXJ2aXZlZCA9PSAwKSAlPiUNCiAgICAgICAgc3VtbWFyaXplKEFnZV9EaWVkX0F2ZyA9IG1lYW4oQWdlLCBuYS5ybSA9IFRSVUUpKQ0KYGBgDQo8YnI+DQoNCiogVGhlICoqaGlnaGVzdCoqIHRpY2tldCBwcmljZSBpbiB0aGUgZmlyc3QgZm9yIHBhc3NlbmdlciBieSBuYW1lICdXaWxsaWFtJyB3aG8gc3Vydml2ZWQ6DQoNCmBgYHtyfQ0KVGl0YW5pYyAlPiUNCiAgICAgICAgZmlsdGVyKFBjbGFzcyA9PSAxLA0KICAgICAgICAgICAgICAgc3RyX2RldGVjdChOYW1lLCAiV2lsbGlhbSIpLA0KICAgICAgICAgICAgICAgU3Vydml2ZWQgPT0gMSkgJT4lDQogICAgICAgIHN1bW1hcml6ZShGYXJlX0F2Z19XaWxsID0gbWluKEZhcmUpKQ0KYGBgDQoNCiMjIFtncm91cF9ieV0oaHR0cHM6Ly9kcGx5ci50aWR5dmVyc2Uub3JnL3JlZmVyZW5jZS9ncm91cF9ieS5odG1sKQ0KIyMjICpHcm91cCBieSBvbmUgb3IgbW9yZSB2YXJpYWJsZXMqDQo8YnI+DQoNCiogVGhlIGF2ZXJhZ2UgYWdlIG9mIHRoZSBwYXNzZW5nZXJzIGluIHRoZSB0aGlyZCBjbGFzcyB3aG8gZGlkIG5vdCBzdXJ2aXZlOg0KYGBge3J9DQpUaXRhbmljICU+JQ0KICAgICAgICAgIGdyb3VwX2J5KFNleCkgJT4lDQogICAgICAgICAgc3VtbWFyaXplKE1lYW5fRmFyZSA9IG1lYW4oRmFyZSwgbmEucm0gPSBUUlVFKSwgDQogICAgICAgICAgICAgICAgICAgIENvdW50ID0gbigpLCANCiAgICAgICAgICAgICAgICAgICAgTWVhbl9BZ2UgPSBtZWFuKEFnZSwgbmEucm0gPSBUUlVFKSkNCmBgYA0KDQojIyBSZXNvdXJjZXMNCg0KKiAjIyMgW0RhdGEgV3JhbmdsaW5nIHdpdGggZHBseXIgYW5kIHRpZHlyIENoZWF0IFNoZWV0XShodHRwczovL3JzdHVkaW8uY29tL3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDE1LzAyL2RhdGEtd3JhbmdsaW5nLWNoZWF0c2hlZXQucGRmKQ==