# Loading the data.
# Install gapminder only when it is missing: an unconditional
# install.packages() re-downloads the package on every run and errors in
# non-interactive sessions without a CRAN mirror, even if the package is
# already installed.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(gapminder)

# Convert the gapminder data set to a plain data frame.
data <- as.data.frame(gapminder::gapminder)

#1. From the data set "data" extract the information you need and save it as "gapdata":
# - you need only the data for 2007 and all continents except "Oceania". Instead of
#selecting every continent you want with the "==" operator, you can exclude the one
#continent you don't want ("Oceania") with the opposite operator, "!=".
# - you need only the columns continent and lifeExp.
library(tidyverse)
# Keep the 2007 rows of every continent except Oceania, retaining only the
# grouping column (continent) and the response (lifeExp).
gapdata <- data %>%
  filter(year == 2007, continent != "Oceania") %>%
  select(continent, lifeExp) %>%
  # filter() removes the Oceania rows, but when continent is a factor (as it
  # is in gapminder) "Oceania" remains as an empty level and later shows up as
  # an empty group in boxplot(); droplevels() discards it.
  droplevels()

#2. Formulate your research question and null hypothesis, what would you like to test there?

#3. Check the size of your sample. Is it large enough?
# Print the structure of gapdata; the "obs." count on the first line of the
# output is the total sample size.
utils::str(gapdata)

#4. Check normality of the numerical variable. Use both analytical and graphical ways. 
#Do you meet the normality assumption?

# Graphical checks: histogram, then a normal Q-Q plot with a reference line.
hist(gapdata$lifeExp, breaks = 20) # not normally distributed
qqnorm(gapdata$lifeExp)
qqline(gapdata$lifeExp, col = "red", lwd = 3)

# Analytical checks: Kolmogorov-Smirnov against a normal distribution whose
# mean and sd are taken from the sample itself, followed by Shapiro-Wilk.
# NOTE(review): ?ks.test says the distribution parameters must be
# pre-specified rather than estimated from the data, so treat the K-S
# p-value as approximate.
ks.test(
  gapdata$lifeExp, "pnorm",
  mean = mean(gapdata$lifeExp),
  sd = sd(gapdata$lifeExp)
)
shapiro.test(gapdata$lifeExp)

#5. Check normality of each group in the categorical variable.
#Do you meet the normality assumption?
# Repeat the normality checks separately for every continent.
# Each continent fills one page: histogram on the left, Q-Q plot on the right.
# Plot titles and console output name the continent, because the default
# labels only echo the subsetting expression and are identical for all groups.
par(mfcol = c(1, 2))
for (i in unique(as.character(gapdata$continent))) {
  # Extract this continent's values once instead of re-subsetting per call.
  life_exp_i <- gapdata$lifeExp[which(gapdata$continent == i)]

  hist(
    life_exp_i,
    breaks = 20,
    main = paste("Histogram of lifeExp:", i),
    xlab = "lifeExp"
  )
  qqnorm(life_exp_i, main = paste("Normal Q-Q Plot:", i))
  qqline(life_exp_i, col = "red", lwd = 3)

  # Announce the group before the test printouts.
  cat("\nContinent:", i, "\n")
  # print() is required: results are not auto-printed inside a for loop.
  print(ks.test(
    life_exp_i, "pnorm",
    mean = mean(life_exp_i),
    sd = sd(life_exp_i)
  ))
  print(shapiro.test(life_exp_i))
}

# Back to a single plot per page.
par(mfcol = c(1, 1))

#6. We failed the normality assumption. Which test should you perform instead of ANOVA?
# Which assumption must be met for that test? Check this assumption.

# Distribution shapes of the groups should not differ much from each other.
# Draw one histogram per continent on a shared 2 x 2 page, each titled with
# its continent so the shapes can be compared side by side.
par(mfcol = c(2, 2))
for (i in unique(as.character(gapdata$continent))) {
  hist(
    gapdata$lifeExp[which(gapdata$continent == i)],
    breaks = 20,
    main = i,
    xlab = "lifeExp"
  )
}
# Back to a single plot per page.
par(mfcol = c(1, 1))

#7. Perform the test. What conclusion can you draw? Do you reject or fail to reject the null hypothesis?

# Kruskal-Wallis rank sum test of lifeExp across continents.
# The columns are named in the formula and looked up through `data =`, so the
# printed "data:" line reads "lifeExp by continent".
kruskal.test(lifeExp ~ continent, data = gapdata)
# There is a significant difference in life expectancy in 2007 between the continents (p-value<0.001).

#8. You want to find out between which groups the difference is significant, using a post-hoc test. Which test should you perform?
# Perform the test. Build the boxplot. Does the boxplot agree with the output of Dunn's test?

# Dunn's test
# Install dunn.test only when it is missing; an unconditional
# install.packages() re-installs on every run and errors in non-interactive
# sessions without a CRAN mirror.
if (!requireNamespace("dunn.test", quietly = TRUE)) {
  install.packages("dunn.test")
}
library(dunn.test)
# NOTE(review): no `method` is supplied, so the package default applies
# (no multiple-comparison adjustment, per the dunn.test manual) -- confirm
# this is intended for the pairwise comparisons.
dunn.test(gapdata$lifeExp, gapdata$continent, altp = TRUE)
# droplevels() removes any unused factor level (e.g. the filtered-out
# "Oceania") so the plot has no empty group; `data =` gives plain axis labels.
boxplot(lifeExp ~ continent, data = droplevels(gapdata))

#9. If you have time, perform all the steps but instead of life expectancy choose the population ("pop") variable.

# Rebuild gapdata for the population analysis: 2007 rows of every continent
# except Oceania, keeping only continent and pop.
gapdata <- data %>%
  filter(year == 2007, continent != "Oceania") %>%
  select(continent, pop) %>%
  # filter() removes the Oceania rows, but when continent is a factor (as it
  # is in gapminder) "Oceania" remains as an empty level and later shows up as
  # an empty group in boxplot(); droplevels() discards it.
  droplevels()


# Graphical checks: histogram, then a normal Q-Q plot with a reference line.
hist(gapdata$pop, breaks = 20) # not normally distributed
qqnorm(gapdata$pop)
qqline(gapdata$pop, col = "red", lwd = 3)

# Analytical checks: Kolmogorov-Smirnov against a normal distribution whose
# mean and sd are taken from the sample itself, followed by Shapiro-Wilk.
# NOTE(review): ?ks.test says the distribution parameters must be
# pre-specified rather than estimated from the data, so treat the K-S
# p-value as approximate.
ks.test(
  gapdata$pop, "pnorm",
  mean = mean(gapdata$pop),
  sd = sd(gapdata$pop)
)
shapiro.test(gapdata$pop)

# Repeat the normality checks on pop separately for every continent.
# Each continent fills one page: histogram on the left, Q-Q plot on the right.
# Plot titles and console output name the continent, because the default
# labels only echo the subsetting expression and are identical for all groups.
par(mfcol = c(1, 2))
for (i in unique(as.character(gapdata$continent))) {
  # Extract this continent's values once instead of re-subsetting per call.
  pop_i <- gapdata$pop[which(gapdata$continent == i)]

  hist(
    pop_i,
    breaks = 20,
    main = paste("Histogram of pop:", i),
    xlab = "pop"
  )
  qqnorm(pop_i, main = paste("Normal Q-Q Plot:", i))
  qqline(pop_i, col = "red", lwd = 3)

  # Announce the group before the test printouts.
  cat("\nContinent:", i, "\n")
  # print() is required: results are not auto-printed inside a for loop.
  print(ks.test(
    pop_i, "pnorm",
    mean = mean(pop_i),
    sd = sd(pop_i)
  ))
  print(shapiro.test(pop_i))
}

# Back to a single plot per page.
par(mfcol = c(1, 1))

# Compare the distribution shapes of pop between the groups: one histogram
# per continent on a shared 2 x 2 page, each titled with its continent.
par(mfcol = c(2, 2))
for (i in unique(as.character(gapdata$continent))) {
  hist(
    gapdata$pop[which(gapdata$continent == i)],
    breaks = 20,
    main = i,
    xlab = "pop"
  )
}
# Back to a single plot per page.
par(mfcol = c(1, 1))

# Kruskal-Wallis rank sum test of pop across continents.
# The columns are named in the formula and looked up through `data =`, so the
# printed "data:" line reads "pop by continent".
kruskal.test(pop ~ continent, data = gapdata)

# Dunn's test
library(dunn.test)
# NOTE(review): no `method` is supplied, so the package default applies
# (no multiple-comparison adjustment, per the dunn.test manual) -- confirm
# this is intended for the pairwise comparisons.
dunn.test(gapdata$pop, gapdata$continent, altp = TRUE)
# Population is plotted on a log10 scale. droplevels() removes any unused
# factor level (e.g. the filtered-out "Oceania") so the plot has no empty
# group; `data =` gives plain axis labels.
boxplot(log10(pop) ~ continent, data = droplevels(gapdata))