# REGRESSION BASICS

# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, HSAUR, MASS, lmtest, sandwich, dplyr)

# Basic linear regression #####

# How does braking distance depend on the car's speed?

# Plot the relationship
plot(cars$speed, cars$dist)

# Try to get familiar with the ggplot2 library.
# Its ability to produce beautiful graphs is unmatched.
# This is the one book you want to look into: https://ggplot2-book.org
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point()

# Fit a simple straight line through the data
lmod <- lm(dist ~ speed, data=cars)
summary(lmod)

# There are many different lines that we could fit
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_abline(slope=3, intercept=10, col="grey") +
  geom_abline(slope=5, intercept=-30, col="grey") +
  geom_abline(slope=8, intercept=-80, col="grey") +
  geom_abline(slope=lmod$coeff[2], intercept=lmod$coeff[1], col="blue")

# or make use of the built-in options in ggplot2
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x, se=FALSE)

# Returns to education #####
#
# Data downloaded from https://davidcard.berkeley.edu/data_sets.html
# from the paper:
# "Using Geographic Variation in College Proximity to Estimate the Return to Schooling".
# In L.N. Christofides, E.K. Grant, and R. Swidinsky, editors, Aspects of Labor Market
# Behaviour: Essays in Honour of John Vanderkamp. Toronto: University of Toronto Press, 1995.
# Data also here: https://docs.google.com/spreadsheets/d/116KwP9SJUQ8LZHeIH-sTvlFFwCgkKIsz86exk1I7O1s/edit#gid=943363882
# save as card.xlsx
# Inspired by Problem 4 from https://sites.google.com/view/microeconometricswithr/problems/problems-ch1

# adjust the path to your own machine
setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# this uses the readxl package
data_card <- read_xlsx("card.xlsx")

lwage <- as.numeric(data_card$lwage76)   # log wage (1976)
educ  <- as.numeric(data_card$ed76)      # years of education (1976)
age   <- as.numeric(data_card$age76)     # age (1976)
south <- as.numeric(data_card$south66)   # lived in the south (1966)

# define experience
expi <- age - educ + 6

# create a data frame
df <- data.frame(lwage, educ, age, expi, south)

# plot the data
plot(educ, lwage)
summary(lm(lwage ~ educ))

# plot it together with linear and quadratic fits
ggplot(aes(x=educ, y=lwage), data=df) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), col="red")

# adding age does not change our parameter of interest much
summary(lm(lwage ~ educ + age))

# plot lwage against experience
plot(expi, lwage)
summary(lm(lwage ~ expi))  # not significant

# visual inspection confirms this
ggplot(aes(x=expi, y=lwage), data=df) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), col="red")

# why is the coefficient of educ so wildly different?
summary(lm(lwage ~ educ + expi))

# they are highly correlated
cor(educ, expi)

# and what is going on here??
# (hint: expi = age - educ + 6 by construction, so the three regressors
#  are exactly collinear and lm() reports NA for one of them)
summary(lm(lwage ~ educ + age + expi))

# Interaction terms #####

# ...previous example continued
ggplot(aes(x=educ, y=lwage), data=df) +
  geom_point(aes(colour=factor(south)))

# you may want to add some noise in order to avoid overplotting
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_jitter(width = 0.5, height = 0)
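# An alternative to jitter -- a minimal sketch, not part of the original script:
# semi-transparent points also reduce overplotting without moving the data.
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_point(alpha = 0.2)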
# is the association between education and wages different in the south
# than in the north?
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_jitter(width = 0.5, height = 0) +
  geom_smooth(method = "lm", formula = y ~ x)

# overall
lm(lwage ~ educ, data=df)$coeff[2]                         # 5.21%
# south only
lm(lwage ~ educ, data=df %>% filter(south==1))$coeff[2]    # 5.69%
# north only
lm(lwage ~ educ, data=df %>% filter(south==0))$coeff[2]    # 3.18%

# regression with interaction
lm(lwage ~ educ*south, data=df)
# 3.19 + 2.50 = 5.69
# the interaction regression is the same as running the two regressions separately

# how about an education x age interaction?
# is the association between wages and education different for old/young workers?
ggplot(aes(x=educ, y=lwage, colour=factor(age>median(age))), data=df) +
  geom_jitter(width = 0.5, height = 0) +
  geom_smooth(method = "lm", formula = y ~ x)
# the intercept does not appear to be very different

summary(lm(lwage ~ educ*I(age>28), data=df))
# and this is confirmed by this regression:
# the interaction term is 2.4%, so the association between educ and lwage
# is stronger for older workers (as seen in the figure)

# by dichotomizing age into two categories we lose information
# let's make use of the full information here
summary(lm(lwage ~ educ*age, data=df))
# this is somewhat difficult to interpret
# there isn't a person who is 0 years old
mean(df$age)

df$agec <- age - 28
summary(lm(lwage ~ educ*agec, data=df))
# here the education coefficient is the association for someone who is 28 years old
# the interaction term stays the same, R2 stays the same (do you see why?)
# how about the age coefficient?

df$educc <- df$educ - 12
summary(lm(lwage ~ educc*agec, data=df))
# now we can interpret the age coefficient.
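# A quick check of the earlier claim -- a minimal sketch, not in the original
# script (m_raw and m_cen are just illustrative names): recentering age is a
# linear reparameterization, so fitted values, R^2 and the interaction
# coefficient are unchanged.
m_raw <- lm(lwage ~ educ*age,  data=df)
m_cen <- lm(lwage ~ educ*agec, data=df)
all.equal(fitted(m_raw), fitted(m_cen))
c(summary(m_raw)$r.squared, summary(m_cen)$r.squared)
c(coef(m_raw)["educ:age"], coef(m_cen)["educ:agec"])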
# Pearl's Simpson machine #####

# Source: http://dagitty.net/learn/simpson/index.html
# Adding variables changes the model's structure.
# Consecutively adding variables may change parameter signs at every step!!!

set.seed(89374)

simpson.simulator <- function(N, s, ce){
  Z1 <- rnorm(N,0,s)
  Z3 <- rnorm(N,0,s) + Z1
  U  <- rnorm(N,0,s) + Z1
  Z2 <- rnorm(N,0,s) + Z3 + U
  X  <- rnorm(N,0,s) + U
  Y  <- rnorm(N,0,s) + ce*X + 10*Z3
  data.frame(Y,X,Z1,Z2,Z3)
}
# 1st parameter: sample size
# 2nd parameter: noise standard deviation
# 3rd parameter: true causal effect

D <- simpson.simulator(1000,0.01,1)
# true effect is 1

# unadjusted estimate
m <- lm(D[,1:2])
summary(m)
confint(m,'X')

# adjusted for {Z1}
m <- lm(D[,c(1,2,3)])
summary(m)
confint(m,'X')
# coeff is about 1

# adjusted for {Z1,Z2}
m <- lm(D[,c(1,2,3,4)])
summary(m)
confint(m,'X')
# coeff is about -1
# (Z2 is a collider on the path X <- U -> Z2 <- Z3 -> Y;
#  adjusting for Z2 opens that path and here even flips the sign)

# adjusted for {Z1,Z2,Z3}
m <- lm(D[,c(1,2,3,4,5)])
summary(m)
confint(m,'X')
# coeff is about 1

# Heteroskedasticity robust standard errors #####

# generate a random dataset
set.seed(930)
n <- 1000
x1 <- rnorm(n)
y <- 1 + 3*x1 + x1*rnorm(n)

lmod <- lm(y ~ x1)
summary(lmod)

# look at the residual plot, there clearly is a pattern
plot(lmod,1)

# Here we show how the HR standard errors can be calculated

# get the estimated coefficients
hatbeta <- as.numeric(lmod$coeff)
# matrix X
x <- cbind(rep(1,n),x1)
# residuals, hat epsilon
e <- y - x %*% hatbeta
# X^T X
xx <- t(x)%*%x
# (X^T X)^{-1}
xxi <- solve(xx)
# (X^T X)^{-1} X^T y is the same as hatbeta
xxi %*% t(x) %*% y

# calculate the leverage
leverage <- 1 - diag(diag(n) - x %*% xxi %*% t(x))

# calculation of the different standard errors
# (source: adapted from Hansen, Econometrics, p. 144)
p <- ncol(x)
a <- n/(n-p)
sig2 <- as.numeric( (t(e) %*% e)/(n-p) )

u1 <- x*(e%*%matrix(1,1,p))
u2 <- x*((e/sqrt(1-leverage))%*%matrix(1,1,p))
u3 <- x*((e/(1-leverage))%*%matrix(1,1,p))

v0  <- xxi * sig2
v1  <- xxi %*% (t(u1)%*%u1) %*% xxi
v1a <- a * xxi %*% (t(u1)%*%u1) %*% xxi
v2  <- xxi %*% (t(u2)%*%u2) %*% xxi
v3  <- xxi %*% (t(u3)%*%u3) %*% xxi

# Homoskedastic formula
( s0 <- sqrt(diag(v0)) )
# HC0
( s1 <- sqrt(diag(v1)) )
# HC1
( s1a <- sqrt(diag(v1a)) )
# HC2
( s2 <- sqrt(diag(v2)) )
# HC3
( s3 <- sqrt(diag(v3)) )

# A simpler way is to use coeftest from the lmtest library

# Homoskedastic formula
coeftest(lmod)
# HC0
coeftest(lmod, vcov = sandwich)
coeftest(lmod, vcov = vcovHC(lmod, "HC0"))
# HC1 (the default in Stata: "reg y x, robust")
coeftest(lmod, vcov = vcovHC(lmod, "HC1"))
# HC2
coeftest(lmod, vcov = vcovHC(lmod, "HC2"))
# HC3
coeftest(lmod, vcov = vcovHC(lmod, "HC3"))

# Diagnostics #####

# correct model
x <- runif(100, 0, 10)
y <- 1 + 2 * x + rnorm(100, 0, 1)
m <- lm(y ~ x)
par(mfrow = c(2, 2))
plot(m)

# a misspecified model: the true relationship is cubic, but we fit a line
y <- 1 + 2 * x + 1 * x^2 - 0.5 * x^3
m <- lm(y ~ x)
par(mfrow = c(2, 2))
plot(m)

# Model Building #####
#
# Some high-school physics first:
#
# [kinetic energy] = 0.5 * weight * speed^2
# [work done by braking] = mu * weight * g * distance
#   mu - coefficient of friction
#   g  - standard acceleration of free fall
#
# [kinetic energy] = [work done by braking]
# leads to
# distance = (1/(2*mu*g)) * speed^2 = c * speed^2

lmod2 <- lm(dist ~ speed + I(speed^2), data=cars)
summary(lmod2)

lmod3 <- lm(dist ~ I(speed^2), data=cars)
summary(lmod3)

lmod4 <- lm(dist ~ I(speed^2) - 1, data=cars)
summary(lmod4)
# Be careful, removing the intercept changes the interpretation of R^2

ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x, se=FALSE) +
  geom_smooth(method='lm', formula= y~x + I(x^2), se=FALSE, col="red") +
  geom_smooth(method='lm', formula= y~I(x^2), se=FALSE, col="green") +
  geom_smooth(method='lm', formula= y~I(x^2) - 1, se=FALSE, col="black")
# Which one would you pick??
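# A sketch of one way to compare these candidate models on the same response
# (AIC; lower is better). This is not in the original script; note that lmod
# from the first section was overwritten in the heteroskedasticity example,
# so we refit the straight line here under the illustrative name lmod1.
lmod1 <- lm(dist ~ speed, data=cars)
AIC(lmod1, lmod2, lmod3, lmod4)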
lmod5 <- lm(log(dist) ~ log(speed), data=cars)
summary(lmod5)
# the estimated exponent is about 1.6, though, not the 2 suggested by the physics

# Fit is certainly not the only criterion to pick a model
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7), se=FALSE)
# do you see why?

# Model Selection #####

# Freedman, David A. "A note on screening regression equations."
# The American Statistician 37.2 (1983): 152-155.

# set the random seed, so that the results are reproducible
set.seed(32413)
# try different seeds, you may get luckier
#set.seed(32414)

# create 51 columns of noise (iid N(0,1))
DAT <- matrix(rnorm(100*51), nrow=100, ncol=51)
# save them into y and X
y <- DAT[,1]
X <- DAT[,-1]

# regress noise on noise (no intercept)
summary(lm(y ~ X - 1))
# store the summary table
sum_table <- summary(lm(y ~ X - 1))

# how many variables have p-values < 0.25?
sum(sum_table$coeff[,4]<0.25)
# how many variables have p-values < 0.05?
sum(sum_table$coeff[,4]<0.05)

# now pick only those variables that had p < 0.25
X2 <- X[,which(sum_table$coeff[,4]<0.25)]
# again, run a linear regression
summary(lm(y ~ X2 - 1))

# Remember, we attempted to explain noise by noise!
# Those numbers may indeed look quite convincing!
# Lesson to take: it is risky to rely on automatic
# model selection techniques. Model building requires skill
# and subject matter expertise.

# Sample size #####

set.seed(54634)

nObs <- 10000000
x <- rnorm(nObs)
eps <- rnorm(nObs)
y <- 1 + 0.001*x + eps
summary(lm(y ~ x))

nSim <- 1000
resFew  <- rep(NA,nSim)
resMid  <- rep(NA,nSim)
resMany <- rep(NA,nSim)

nObsFew  <- 10000
nObsMid  <- 5000
nObsMany <- 100000

for (iSim in 1:nSim){
  print(iSim)

  x <- rnorm(nObsFew)
  eps <- rnorm(nObsFew)
  y <- 1 + 0.01*x + eps
  resFew[iSim] <- lm(y ~ x)$coeff[2]

  x <- rnorm(nObsMid)
  eps <- rnorm(nObsMid)
  y <- 1 + 0.01*x + eps
  resMid[iSim] <- lm(y ~ x)$coeff[2]

  x <- rnorm(nObsMany)
  eps <- rnorm(nObsMany)
  y <- 1 + 0.01*x + eps
  resMany[iSim] <- lm(y ~ x)$coeff[2]
}

DF <- data.frame(simNumber = c(1:nSim,1:nSim,1:nSim),
                 coeff = c(resMany,resMid,resFew),
                 yy = c(rep("Many",nSim),rep("Mid",nSim),rep("Few",nSim)))

ggplot(DF, aes(x=coeff)) +
  geom_histogram(data=subset(DF,yy == 'Many'), fill = "blue", alpha = 0.2) +
  #geom_histogram(data=subset(DF,yy == 'Mid'), fill = "green", alpha = 0.2) +
  geom_histogram(data=subset(DF,yy == 'Few'),  fill = "red",  alpha = 0.2)

ggplot(DF, aes(x=coeff)) +
  geom_density(data=subset(DF,yy == 'Many'), fill = "blue", alpha = 0.2) +
  #geom_density(data=subset(DF,yy == 'Mid'), fill = "green", alpha = 0.2) +
  geom_density(data=subset(DF,yy == 'Few'),  fill = "red",  alpha = 0.2)
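# The spread of the estimates above illustrates the root-n rate: with x and
# eps both standard normal, sd of the slope estimate is approximately
# 1/sqrt(nObs). A quick check -- a minimal sketch, not in the original script:
c(empirical = sd(resFew),  theoretical = 1/sqrt(nObsFew))
c(empirical = sd(resMany), theoretical = 1/sqrt(nObsMany))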
# Principal Components Analysis #####

# Example - Heptathlon
data("heptathlon", package = "HSAUR")

# scoring system - source: https://en.wikipedia.org/wiki/Heptathlon
#                  hurdles highjump shot run200m longjump javelin run800m
coeffs <- matrix(c(9.23076,  26.7, 1.835,   # hurdles
                   1.84523,  75,   1.348,   # high jump
                   56.0211,  1.5,  1.05,    # shot put
                   4.99087,  42.5, 1.81,    # run 200m
                   0.188807, 210,  1.41,    # long jump
                   15.9803,  3.8,  1.04,    # javelin
                   0.11193,  254,  1.88     # run 800m
                   ), 3, 7, byrow=FALSE)

P1 <- coeffs[1,1]*(coeffs[2,1] - heptathlon$hurdles)^coeffs[3,1]      # hurdles
P4 <- coeffs[1,4]*(coeffs[2,4] - heptathlon$run200m)^coeffs[3,4]      # run 200m
P7 <- coeffs[1,7]*(coeffs[2,7] - heptathlon$run800m)^coeffs[3,7]      # run 800m
P2 <- coeffs[1,2]*(heptathlon$highjump*100 - coeffs[2,2])^coeffs[3,2] # high jump
P5 <- coeffs[1,5]*(heptathlon$longjump*100 - coeffs[2,5])^coeffs[3,5] # long jump
P3 <- coeffs[1,3]*(heptathlon$shot - coeffs[2,3])^coeffs[3,3]         # shot put
P6 <- coeffs[1,6]*(heptathlon$javelin - coeffs[2,6])^coeffs[3,6]      # javelin

P <- P1 + P2 + P3 + P4 + P5 + P6 + P7
round(P)
heptathlon$score
# not quite the same, but perhaps the scoring tables have changed since 1988?

# recode the variables so that more is better
heptathlon$hurdles <- max(heptathlon$hurdles) - heptathlon$hurdles
heptathlon$run200m <- max(heptathlon$run200m) - heptathlon$run200m
heptathlon$run800m <- max(heptathlon$run800m) - heptathlon$run800m

# look at the correlation matrix
score_index <- which(colnames(heptathlon) == "score")
round(cor(heptathlon[,-score_index]), 2)

# perform PCA
heptathlon_pca <- prcomp(heptathlon[, -score_index], scale = TRUE)
print(heptathlon_pca)
summary(heptathlon_pca)

# plot the pairwise scatterplots
plot(heptathlon[,-score_index])

# first PC score for each competitor
center <- heptathlon_pca$center
scale <- heptathlon_pca$scale
hm <- as.matrix(heptathlon[,-score_index])
drop(scale(hm, center = center, scale = scale) %*% heptathlon_pca$rotation[,1])
# or simply
predict(heptathlon_pca)[,1]

# how is the score correlated with the first PC?
cor(heptathlon$score, heptathlon_pca$x[,1])   # well ok
plot(heptathlon$score, heptathlon_pca$x[,1])  # not bad

# how do we interpret it?
plot(heptathlon_pca)
biplot(heptathlon_pca, col = c("gray", "black"))
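# A sketch relating the prcomp output to the eigen-decomposition of the
# correlation matrix (not in the original script): because the variables were
# scaled, the PC variances from prcomp equal the eigenvalues of cor().
ev <- eigen(cor(heptathlon[, -score_index]))
round(ev$values, 3)              # eigenvalues = variances of the PCs
round(heptathlon_pca$sdev^2, 3)  # the same numbers from prcomp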