#1. Upload the "salary_data.csv" data set.
salary<-read.csv("salary_data.csv")

#2. Build the plot to look at the relationship between the variables. 
#What will be the dependent variable (outcome), what will be the independent variable (predictor)?
plot(salary$Productivity ~ salary$Salary, pch=16, col="darkblue")

#3. Perform linear regression analysis (fit a simple linear regression model between the variables).
#Draw the best-fit regression line.
model<-lm(salary$Productivity ~ salary$Salary)
abline(model, col="red", lwd=3)

#4. Check the main assumptions of the model, use the four main plots for checking:
#Plot 1.Linearity of the data, independence of residuals
#Plot 2.Normality of residuals using Q-Q plot 
#Plot 3.Constant variance of residuals 
#Plot 4. No influential outliers
par(mfrow=c(2,2))
plot(model)
par(mfrow=c(1,1))

#5. Check the assumption "Normality of residuals" using histogram and normality tests; and "Zero mean of residuals". Don't forget to look at the Q-Q plot from the previous question.
hist(model$residuals, breaks=20)
mean(model$residuals) #Interpret the number
ks.test(model$residuals,"pnorm",mean(model$residuals, na.rm=TRUE),sd(model$residuals, na.rm=TRUE))
shapiro.test(model$residuals)

#6. Obtain parameters of the regression (the intercept, the slope of the line); check the significance. Fill up the check list.
summary(model)

#7. Obtain criteria for the model evaluation (Adjusted R-squared, RSE, AIC, the 95% confidence intervals). Fill up the check list.
summary(model)
confint(model)
AIC(model)

#8. After checking all the assumptions, what conclusion can you make?  

#9. Take away the outlier (number 5 on the previous plots) that has a high influence on the regression line. 
#To identify the outlier, first look at the histograms of the variables.
hist(salary$Productivity, breaks=20)
hist(salary$Salary, breaks=20)

#10. Delete the outlier using the tidyverse package. 
library(tidyverse)
salary<- salary %>%
  filter(Productivity>30)

#11. Now when you have data without the outlier, fit an adjusted simple linear regression model (repeat steps 2-7).

plot(salary$Productivity ~ salary$Salary, pch=16, col="darkblue")
model<-lm(salary$Productivity ~ salary$Salary)
abline(model, col="red", lwd=3)

par(mfrow=c(2,2))
plot(model)
par(mfrow=c(1,1))

hist(model$residuals, breaks=20)
mean(model$residuals) #Interpret the number
ks.test(model$residuals,"pnorm",mean(model$residuals, na.rm=TRUE),sd(model$residuals, na.rm=TRUE))
shapiro.test(model$residuals)

summary(model)
confint(model)
AIC(model)

#12. Fill up the check list, compare the models, choose a better model and make your conclusions.