# Exercise: compare a numeric gapminder variable (2007) between continents
# using a non-parametric workflow: normality checks, Kruskal-Wallis test and
# Dunn's post-hoc test. Steps 1-8 use life expectancy; step 9 repeats the
# analysis for population.

# Setup ----

# Uploading the data.
# Install the packages only when they are missing, so that re-running the
# script does not re-download them every time.
for (pkg in c("gapminder", "dunn.test")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(gapminder)
library(tidyverse)
library(dunn.test)

# Changing format of the data to data frame.
data <- as.data.frame(gapminder)

# Helpers ----

# Graphical and analytical normality check of one numeric vector.
#   x     numeric vector to check.
#   label text used in the plot titles and in the printed header.
# Draws a histogram and a normal Q-Q plot, then prints the Kolmogorov-Smirnov
# and Shapiro-Wilk test results. Returns NULL invisibly.
check_normality <- function(x, label) {
  hist(x, breaks = 20, main = paste("Histogram of", label), xlab = label)
  qqnorm(x, main = paste("Normal Q-Q plot of", label))
  qqline(x, col = "red", lwd = 3)

  cat("\nNormality tests for", label, "\n")
  # NOTE(review): the mean and sd passed to ks.test() are estimated from the
  # same sample, while ks.test() expects pre-specified parameters; treat this
  # p-value as approximate and rely mainly on Shapiro-Wilk and the plots.
  print(ks.test(x, "pnorm", mean(x), sd(x)))
  print(shapiro.test(x))
  invisible(NULL)
}

# Runs check_normality() separately for every group.
#   x     numeric vector.
#   group grouping vector of the same length as x.
#   label name of the numeric variable, used in titles.
# Histogram and Q-Q plot of a group are drawn side by side; the previous
# graphical layout is restored on exit.
check_normality_by_group <- function(x, group, label) {
  old_par <- par(mfcol = c(1, 2))
  on.exit(par(old_par), add = TRUE)

  for (g in as.character(unique(group))) {
    # %in% never yields NA, so missing group values are simply skipped.
    check_normality(x[group %in% g], paste0(label, " (", g, ")"))
  }
  invisible(NULL)
}

# Draws one histogram per group on a shared 2 x 2 page so that the shapes of
# the group distributions can be compared. Arguments as in
# check_normality_by_group(). The previous layout is restored on exit.
plot_group_histograms <- function(x, group, label) {
  old_par <- par(mfcol = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  for (g in as.character(unique(group))) {
    hist(x[group %in% g], breaks = 20, main = g, xlab = label)
  }
  invisible(NULL)
}

# 1. Extract the data ----
# From the data set "data" extract the information you need and save it as
# "gapdata":
#  - you need only the data of 2007 and all the continents except "Oceania"
#    (instead of the including "==" operator you can use the excluding "!="
#    operator, because only "Oceania" has to be left out);
#  - you need only the columns continent and lifeExp.
# droplevels() removes the now-empty "Oceania" factor level; without it the
# boxplot in step 8 would show an empty "Oceania" group.
gapdata <- data %>%
  filter(year == 2007, continent != "Oceania") %>%
  select(continent, lifeExp) %>%
  droplevels()

# 2. Formulate your research question and null hypothesis: what would you
#    like to test here?

# 3. Check the size of your sample. Is it large enough?
str(gapdata)

# 4. Check normality of the numerical variable. Use both analytical and
#    graphical ways. Do you meet the normality assumption?
check_normality(gapdata$lifeExp, "lifeExp") # not normally distributed

# 5. Check normality of each group in the categorical variable.
#    Do you meet the normality assumption?
check_normality_by_group(gapdata$lifeExp, gapdata$continent, "lifeExp")

# 6. We failed the normality assumption. Which test should you perform
#    instead of ANOVA? Which assumption needs to be met? Check this
#    assumption.
# The distribution shapes of the groups should not differ much from each
# other.
plot_group_histograms(gapdata$lifeExp, gapdata$continent, "lifeExp")

# 7. Perform the test. What conclusion can you make? Do you reject or fail
#    to reject the null hypothesis?
# Kruskal-Wallis
kruskal.test(lifeExp ~ continent, data = gapdata)
# There is a significant difference in life expectancy in 2007 between the
# continents (p-value < 0.001).

# 8. You want to check between which groups this difference is significant
#    using a post-hoc test. Which test should you perform to check it?
#    Perform the test. Build the boxplot. Does the boxplot agree with the
#    Dunn's test output?
# Dunn's test
# NOTE(review): no `method` argument is given, so dunn.test() applies its
# default p-value adjustment - confirm whether a multiple-comparison
# correction (e.g. method = "bonferroni") is wanted here.
dunn.test(gapdata$lifeExp, gapdata$continent, altp = TRUE)
boxplot(lifeExp ~ continent, data = gapdata)

# 9. If you have time, perform all the steps again, but instead of life
#    expectancy choose the population ("pop") variable.
gapdata <- data %>%
  filter(year == 2007, continent != "Oceania") %>%
  select(continent, pop) %>%
  droplevels()

check_normality(gapdata$pop, "pop") # not normally distributed
check_normality_by_group(gapdata$pop, gapdata$continent, "pop")
plot_group_histograms(gapdata$pop, gapdata$continent, "pop")

# Kruskal-Wallis
kruskal.test(pop ~ continent, data = gapdata)

# Dunn's test
dunn.test(gapdata$pop, gapdata$continent, altp = TRUE)
# Population is plotted on a log10 scale in the boxplot.
boxplot(log10(pop) ~ continent, data = gapdata, ylab = "log10(pop)")