# Read the raw pig data set.
# stringsAsFactors = FALSE is passed explicitly because it only became the
# read.csv() default in R 4.0. On older versions a text column would be read
# as a factor, and as.numeric() on a factor returns the internal level codes
# rather than the values.
pigs <- read.csv("pigs_normality.csv", stringsAsFactors = FALSE)

# Let's look at the structure
str(pigs)

# Force Bodyweight to numeric. If the column was read as text, entries that
# cannot be parsed as numbers become NA (R warns about the coercion).
pigs$Bodyweight <- as.numeric(pigs$Bodyweight)
str(pigs)

# Let's check the categorical variables
unique(pigs$Gender)
unique(pigs$Diet)

# Let's harmonize it: rows whose Diet is "hight fat" are relabelled "hf".
# The replacement is vectorized instead of looping over the rows. %in% never
# returns NA, so rows with a missing Diet are left as they are and a data
# frame with zero rows is handled too; the element-wise `if` comparison would
# stop with an error in both cases.
pigs$Diet[pigs$Diet %in% "hight fat"] <- "hf"

unique(pigs$Diet)

# Numerical variable: summary statistics of the body weights
summary(pigs$Bodyweight)

# Keep only the rows that have a body weight (drop the NA rows)
library(tidyverse)
pigs <- filter(pigs, !is.na(Bodyweight))

# Same summary again on the remaining rows, plus a first histogram
summary(pigs$Bodyweight)
hist(pigs$Bodyweight)

# I would divide our data set in two, according to the diet types.
unique(pigs$Diet)

# Do we need all rows?
# Do we need all columns?
# Why don't I delete the column first?

# High-fat subset: keep the "hf" rows, then drop the Diet column.
hf <- filter(pigs, Diet == "hf") %>%
  select(-Diet)
# The pigs data set is not changed here: some data are extracted from it and
# saved under a new name, and the initial "pigs" data set is left untouched.

# Chow subset: same steps for the "chow" rows.
chow <- filter(pigs, Diet == "chow") %>%
  select(-Diet)

# Now let's do some preliminary analysis on the pooled body weights:
# a histogram and a normal QQ plot with a reference line.
hist(pigs$Bodyweight, breaks = 20)
qqnorm(pigs$Bodyweight)
qqline(pigs$Bodyweight, col = "red", lwd = 3)


# Let's test its normality
shapiro.test(pigs$Bodyweight)

# Kolmogorov-Smirnov needs two inputs, x and y: x is our data and y is what we
# compare it with - "pnorm", i.e. a normal distribution with the mean and sd
# of our data set.
# NOTE(review): the ks.test() documentation says the reference parameters
# should be pre-specified rather than estimated from the same data, so this
# p-value is only approximate; nortest::lillie.test() is the Lilliefors
# variant intended for estimated parameters - consider reporting it as well.
ks.test(pigs$Bodyweight, "pnorm", mean(pigs$Bodyweight), sd(pigs$Bodyweight))

# The Anderson-Darling test comes from the nortest package. Install it only
# when it is missing, so re-running the script does not reinstall the package
# (or fail without network access) every time.
if (!requireNamespace("nortest", quietly = TRUE)) {
  install.packages("nortest")
}
library(nortest)
ad.test(pigs$Bodyweight)

#According to all tests it isn't normally distributed.
#Now let's try to transform the data using logarithm. 

# Log10-transform the pooled body weights and look at the new histogram
pigs_transf <- log10(pigs$Bodyweight)
hist(pigs_transf, breaks = 20)

# Histograms side by side (raw on the left, transformed on the right),
# then back to a single-panel layout
par(mfcol = c(1, 2))
hist(pigs$Bodyweight, breaks = 20)
hist(pigs_transf, breaks = 20)
par(mfcol = c(1, 1))

# QQ plots side by side in the same raw / transformed order
par(mfcol = c(1, 2))
qqnorm(pigs$Bodyweight)
qqline(pigs$Bodyweight, col = "red", lwd = 3)
qqnorm(pigs_transf)
qqline(pigs_transf, col = "red", lwd = 3)
par(mfcol = c(1, 1))

# The three normality tests again, now on the transformed values
shapiro.test(pigs_transf)
ks.test(pigs_transf, "pnorm", mean = mean(pigs_transf), sd = sd(pigs_transf))
ad.test(pigs_transf) # still not normal


#Here I would conclude it was slightly skewed and all tests resulted so,
#log-transformation is suitable here.


#Let's perform all the steps for checking distribution for different diets.

# 1. High fat diet
# 1.1 Histogram and QQ plot of the body weights for the "hf" diet, shown in
#     one row with two columns for a better view.

par(mfcol = c(1, 2))
hist(hf$Bodyweight, breaks = 20)
qqnorm(hf$Bodyweight)
qqline(hf$Bodyweight, col = "red", lwd = 3)
par(mfcol = c(1, 1))

# 1.2 The three normality tests on the untransformed "hf" body weights.

shapiro.test(hf$Bodyweight)
ks.test(
  hf$Bodyweight, "pnorm",
  mean = mean(hf$Bodyweight), sd = sd(hf$Bodyweight)
)
ad.test(hf$Bodyweight)

# 1.3 Log10-transform the "hf" body weights and draw the two histograms in
#     one row to compare them (before and after the transformation).

hf_transf <- log10(hf$Bodyweight)
par(mfcol = c(1, 2))
hist(hf$Bodyweight, breaks = 20)
hist(hf_transf, breaks = 20)
par(mfcol = c(1, 1))

# 1.4 Two QQ plots in one row to compare (before and after transformation).

par(mfcol = c(1, 2))
qqnorm(hf$Bodyweight)
qqline(hf$Bodyweight, col = "red", lwd = 3)
qqnorm(hf_transf)
qqline(hf_transf, col = "red", lwd = 3)
par(mfcol = c(1, 1))

# 1.5 Run the normality tests again, this time on the transformed data.
# What can you conclude?

shapiro.test(hf_transf)
ks.test(hf_transf, "pnorm", mean = mean(hf_transf), sd = sd(hf_transf))
ad.test(hf_transf)

# 2. Now repeat all the steps for the chow diet.

# 2.1 Histogram and QQ plot of the "chow" body weights, side by side
par(mfcol = c(1, 2))
hist(chow$Bodyweight, breaks = 20)
qqnorm(chow$Bodyweight)
qqline(chow$Bodyweight, col = "red", lwd = 3)
par(mfcol = c(1, 1))

# 2.2 The three normality tests on the untransformed "chow" body weights
shapiro.test(chow$Bodyweight)
ks.test(
  chow$Bodyweight, "pnorm",
  mean = mean(chow$Bodyweight), sd = sd(chow$Bodyweight)
)
ad.test(chow$Bodyweight)

# 2.3 Log10-transform the "chow" body weights; histograms before and after
chow_transf <- log10(chow$Bodyweight)
par(mfcol = c(1, 2))
hist(chow$Bodyweight, breaks = 20)
hist(chow_transf, breaks = 20)
par(mfcol = c(1, 1))

# 2.4 QQ plots before and after the transformation, side by side
par(mfcol = c(1, 2))
qqnorm(chow$Bodyweight)
qqline(chow$Bodyweight, col = "red", lwd = 3)
qqnorm(chow_transf)
qqline(chow_transf, col = "red", lwd = 3)
par(mfcol = c(1, 1))

# 2.5 The three normality tests on the transformed "chow" body weights
shapiro.test(chow_transf)
ks.test(chow_transf, "pnorm", mean = mean(chow_transf), sd = sd(chow_transf))
ad.test(chow_transf)