# install.packages("pdftools") # mozna bude nutne doinstalovat libpoppler viz radek nize # sudo apt-get install libpoppler-cpp-dev library(pdftools) # install.packages("glue") library(glue) # install.packages("tidyverse") library(tidyverse) # install.packages("ggthemes") library(ggthemes) # install.packages("XML") library(XML) # install.packages("RCurl") library(RCurl) # install.packages("ggmap") library(ggmap) ################################### # PDF ################################### country <- c("cze", "svk", "deu", "pol", "aut", "hun") url <- "http://www.who.int/diabetes/country-profiles/{country}_en.pdf?ua=1" (urls <- glue(url)) pdf_names <- glue("report_{country}.pdf") walk2(urls, pdf_names, download.file, mode = "wb") raw_text <- map(pdf_names, pdf_text) # table <- raw_text[1] clean_table <- function(table){ table <- str_split(table, "\n", simplify = TRUE) country_name <- table[1, 1] %>% stringr::str_squish() %>% stringr::str_extract(".+?(?=\\sTotal)") table_start <- stringr::str_which(table, "Prevalence of diabetes") table_end <- stringr::str_which(table, "National response to diabetes") table <- table[1, (table_start +1 ):(table_end - 1)] table <- str_replace_all(table, "\\s{2,}", "|") text_con <- textConnection(table) data_table <- read.csv(text_con, sep = "|") colnames(data_table) <- c("Condition", "Males", "Females", "Total") dplyr::mutate(data_table, Country = country_name) } diabetes <- map_df(raw_text, clean_table) %>% gather(Sex, Share, Males, Females, Total) %>% mutate(Share = as.numeric(str_extract(Share, "\\d{1,}\\.\\d{1,}"))) # bar plot ggplot(diabetes) + theme_economist() + scale_fill_hc() + geom_bar(aes(y = Share, x = Sex, fill = Country), stat = "identity", position = "dodge") + facet_wrap(~Condition) # map country_full_name <- c("Czech republic", "Slovakia", "Germany", "Poland", "Austria", "Hungary") map <- map_data("world", region=country_full_name) %>% left_join(diabetes %>% filter(Condition == "Diabetes" & Sex == "Total"), by=c("region" = "Country")) country_center <- map %>% group_by(region) %>% summarise(long = mean(long), lat = mean(lat)) ggplot(map, aes(x=long, y=lat)) + geom_polygon(aes(group=group, fill=Share), color="white")+ geom_text(aes(label=region), data=country_center, size=3, hjust=0.5)+ theme_void() + scale_fill_viridis_c() + theme(legend.position="bottom") ################################### # Web ################################### allthesongs <- data.frame() for (i in 1965:2018) { # create the URL for each year URL <- paste("http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",i,sep="") # parse the HTML results <- htmlTreeParse(getURL(URL, followlocation=TRUE), useInternal=TRUE) billboard_text <- xpathSApply(results, "//table[@class='wikitable sortable']//tr",xmlValue) split_billboard_text <- str_split_fixed(billboard_text,"\n",3) billboard <- as.data.frame(cbind(split_billboard_text[2:101, ], rep(i,100)), stringsAsFactors=FALSE) # row bind this year's data to all the data allthesongs <- rbind(allthesongs, billboard) } colnames(allthesongs) <- c("Rank", "Song", "Artist", "Year") allthesongs$Song <- gsub('\\"', "", allthesongs$Song) allthesongs$Song <- tolower(gsub("[^[:alnum:] ]", "", allthesongs$Song)) allthesongs$Song <- gsub("\\'", "", iconv(allthesongs$Song, to='ASCII//TRANSLIT')) # fix special accent chars allthesongs$Artist <- tolower(gsub("[^[:alnum:] ]", "", allthesongs$Artist)) allthesongs$Artist <- gsub("'e", "e", iconv(allthesongs$Artist, to='ASCII//TRANSLIT')) # fix special accent chars allthesongs$Artist<- 
gsub("'o", "o", allthesongs$Artist) allthesongs <- as_tibble(allthesongs)