# 1. Load the "employment.csv" data set.
employ <- read.csv("employment.csv")

# 2. Plot the relationship between the variables.
#    Outcome (dependent): Employment; predictor (independent): Inflation.
plot(Employment ~ Inflation, data = employ, pch = 16, col = "darkblue")

# 3. Fit a simple linear regression model between the variables and
#    draw the best-fit regression line.
#    Using a formula with `data =` (instead of employ$... terms) keeps the
#    model self-describing and makes predict()/coef() output readable.
model <- lm(Employment ~ Inflation, data = employ)
abline(model, col = "red", lwd = 3)

# 4. Check the main assumptions of the model with the four diagnostic plots:
#    Plot 1. Linearity of the data, independence of residuals
#    Plot 2. Normality of residuals (Q-Q plot)
#    Plot 3. Constant variance of residuals
#    Plot 4. No influential outliers
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))  # restore the default single-panel layout

# 5. Check "normality of residuals" (histogram + normality tests) and
#    "zero mean of residuals"; also consult the Q-Q plot from step 4.
hist(residuals(model), breaks = 20)
mean(residuals(model))
# NOTE(review): ks.test with mean/sd estimated from the same sample is only
# approximate (a Lilliefors correction would be exact); Shapiro-Wilk below
# is the more reliable small-sample normality test.
ks.test(residuals(model), "pnorm",
        mean(residuals(model), na.rm = TRUE),
        sd(residuals(model), na.rm = TRUE))
shapiro.test(residuals(model))

# 6. Parameters of the regression: intercept, slope, 95% confidence intervals.
summary(model)
confint(model)

# 7. Criteria for model evaluation (Adjusted R-squared, AIC).
summary(model)
AIC(model)

# 8. Conclusion: the assumptions are met. The model and the independent
#    variable (inflation) are significant; inflation explains 99.5% of the
#    employment variability. The estimate of the beta1 coefficient equals
#    -1.65 (95% CI [-1.68; -1.63]), the intercept alpha equals 0.03.
#    Y = 0.03 - 1.65 * X (for each one-unit shift of inflation,
#    employment decreases by 1.65).

# 9. Repeat all the steps for the "bpa_age_data.csv" data set.
# 9. BPA concentration vs. age ("bpa_age_data.csv").
#    Outcome: BPA_Concentration; predictor: Age.
bpa <- read.csv("bpa_age_data.csv")
plot(BPA_Concentration ~ Age, data = bpa, pch = 16, col = "darkblue")

# Fit the model via a data-aware formula (clearer than bpa$... terms).
model <- lm(BPA_Concentration ~ Age, data = bpa)
abline(model, col = "red", lwd = 3)

# Four diagnostic plots: linearity/independence, normality (Q-Q),
# constant variance, influential outliers.
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))  # restore the default single-panel layout

# Normality of residuals (histogram + tests) and zero mean of residuals.
hist(residuals(model), breaks = 20)
mean(residuals(model))
# NOTE(review): ks.test with estimated parameters is only approximate;
# Shapiro-Wilk is the more reliable test here.
ks.test(residuals(model), "pnorm",
        mean(residuals(model), na.rm = TRUE),
        sd(residuals(model), na.rm = TRUE))
shapiro.test(residuals(model))

# Parameters, confidence intervals, and evaluation criteria.
summary(model)
confint(model)
AIC(model)

# Conclusion: the assumptions are met — a very good example of that. The
# model and the independent variable (age) are significant; age explains
# 31% of the BPA concentration variability. beta1 = 0.19
# (95% CI [0.14; 0.25]), intercept alpha = 5.10.
# Y = 5.10 + 0.19 * X (for each one-year shift of age, the BPA
# concentration increases by 0.19 ug/L).

# 10. Repeat all the steps for the "employment_1.csv" data set.
employ <- read.csv("employment_1.csv")
plot(Employment ~ Inflation, data = employ, pch = 16, col = "darkblue")

model <- lm(Employment ~ Inflation, data = employ)
abline(model, col = "red", lwd = 3)

par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))

hist(residuals(model), breaks = 20)
mean(residuals(model))
ks.test(residuals(model), "pnorm",
        mean(residuals(model), na.rm = TRUE),
        sd(residuals(model), na.rm = TRUE))
shapiro.test(residuals(model))

summary(model)
confint(model)
AIC(model)

# Conclusion: the assumptions are NOT met — we can't use linear regression
# to quantify the relationship between these variables and should use
# other methods.

# 11. Repeat all the steps for the "age.csv" data set.
# 11. Employment vs. age ("age.csv").
#     Outcome: Employment; predictor: Age.
age <- read.csv("age.csv")
plot(Employment ~ Age, data = age, pch = 16, col = "darkblue")

# Fit the model via a data-aware formula (clearer than age$... terms).
model <- lm(Employment ~ Age, data = age)
abline(model, col = "red", lwd = 3)

# Four diagnostic plots: linearity/independence, normality (Q-Q),
# constant variance, influential outliers.
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))  # restore the default single-panel layout

# Normality of residuals (histogram + tests) and zero mean of residuals.
hist(residuals(model), breaks = 20)
mean(residuals(model))
# NOTE(review): ks.test with estimated parameters is only approximate;
# Shapiro-Wilk is the more reliable test here.
ks.test(residuals(model), "pnorm",
        mean(residuals(model), na.rm = TRUE),
        sd(residuals(model), na.rm = TRUE))
shapiro.test(residuals(model))

# Parameters, confidence intervals, and evaluation criteria.
summary(model)
confint(model)
AIC(model)

# Conclusion: the assumptions are met. However, both the model and the
# independent variable (age) are not statistically significant, so we
# can't use this model.