# Loading packages #####
#"pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, 
               ivreg, modelsummary, gtsummary, hdm, 
               tidyverse, haven)
setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# Returns to schooling example #####
# Endogenous education is instrumented by nearby college dummy
#https://cran.r-project.org/web/packages/ivreg/vignettes/ivreg.html
data("SchoolingReturns", package = "ivreg")
summary(SchoolingReturns[, 1:8])

#run OLS regression
m_ols <- lm(log(wage) ~ education + poly(experience, 2) + ethnicity + smsa + south,
            data = SchoolingReturns)
summary(m_ols)
#nice, so many stars!

#run IV regression, use nearcollege as an instrument
m_iv <- ivreg(log(wage) ~ education + poly(experience, 2) + ethnicity + smsa + south |
                nearcollege + poly(age, 2) + ethnicity + smsa + south,
              data = SchoolingReturns)
summary(m_iv)


#this comparison is from the modelsummary package
m_list <- list(OLS = m_ols, IV = m_iv)
msummary(m_list)
modelplot(m_list, coef_omit = "Intercept|experience")


# Card (1995) again #####
# We can replicate some parts of Table 3 from Card (1995)
# Data downloaded from https://davidcard.berkeley.edu/data_sets.html
# from paper: 
# "Using Geographic Variation in College Proximity to Estimate the Return to Schooling". 
#   In L.N. Christofides, E.K. Grant, and R. Swidinsky, editors, Aspects of Labor Market 
#   Behaviour: Essays in Honour of John Vanderkamp. Toronto: University of Toronto Press, 1995.
# Data also here: https://docs.google.com/spreadsheets/d/116KwP9SJUQ8LZHeIH-sTvlFFwCgkKIsz86exk1I7O1s/edit#gid=943363882
#  save as card.xlsx
# Inspired by Problem 4 from https://sites.google.com/view/microeconometricswithr/problems/problems-ch1

setwd("/Users/lukaslaffers/Dropbox/econx phd/code")
# this uses readxl package
data_card <- read_excel("card.xlsx")
data_card$lwage76 <- as.numeric(data_card$lwage76)
data_card2 <- data_card[is.na(data_card$lwage76)==0,]
data_card2$exp  <- data_card2$age76 - data_card2$ed76 - 6
data_card2$exp2 <- (data_card2$exp^2)/100

#intention to treat regression
lmITT <- lm(lwage76 ~ nearc4 + exp + exp2 + black + reg76r+
              smsa76r + smsa66r + reg662 + reg663 + reg664 +
              reg665 + reg666 + reg667 + reg668 + reg669,
            data=data_card2)
lmITT$coeff[2]

#first stage regression
lmFS <- lm(ed76 ~ nearc4 + exp + exp2 + black + reg76r+
              smsa76r + smsa66r + reg662 + reg663 + reg664 +
              reg665 + reg666 + reg667 + reg668 + reg669,
            data=data_card2)
lmFS$coeff[2]

#IV estimate is the fraction of the two
lmITT$coeff[2]/lmFS$coeff[2]

#compare this to regular OLS
lmOLS <- lm(lwage76 ~ ed76 + exp + exp2 + black + reg76r+
              smsa76r + smsa66r + reg662 + reg663 + reg664 +
              reg665 + reg666 + reg667 + reg668 + reg669,
            data=data_card2)
lmOLS$coeff[2]

attach(data_card2)

# IV Matrix algebra #####
#try the matrix algebra
n <- length(lwage76)

y <- lwage76
X <- cbind(rep(1,n),
           exp,exp2,black,reg76r,
           smsa76r,smsa66r,reg662,reg663,reg664,
           reg665,reg666,reg667,reg668,reg669,
           ed76)
Z <- cbind(rep(1,n),
           exp,exp2,black,reg76r,
           smsa76r,smsa66r,reg662,reg663,reg664,
           reg665,reg666,reg667,reg668,reg669,
           nearc4)

hatX <- Z %*% solve(t(Z) %*% Z) %*% t(Z) %*% X
(hatBeta <- solve(t(hatX) %*% hatX) %*% t(hatX) %*% y)

PZ <- Z %*% solve(t(Z) %*% Z) %*% t(Z)
(hatBeta_alt <- solve(t(X) %*% PZ %*% X) %*% t(X) %*% PZ %*% y)

#perform bootstrap to get standard errors
set.seed(132)
nB      <- 100
results <- numeric(nB)

for (iB in 1:nB){
  ind <- sample(1:n,rep=TRUE)
  
  print(iB)
  
  yb <- y[ind]
  Xb <- X[ind,]
  Zb <- Z[ind,]
  
  hatXb <- Zb %*% solve(t(Zb) %*% Zb) %*% t(Zb) %*% Xb
  results[iB] <- (solve(t(hatXb) %*% hatXb) %*% t(hatXb) %*% yb)[16]
}
sd(results)

# or use the ivreg function that we used before
lmod_iv <- ivreg(lwage76 ~ ed76 + exp + exp2 + black + reg76r+
                smsa76r + smsa66r + reg662 + reg663 + reg664 +
                reg665 + reg666 + reg667 + reg668 + reg669|
                nearc4 + exp + exp2 + black + reg76r+
                smsa76r + smsa66r + reg662 + reg663 + reg664 +
                reg665 + reg666 + reg667 + reg668 + reg669,
              data = data_card2)
summary(lmod_iv)


#is the nearc4 truly as good as random?
tbl_summary(data_card2 %>% select("ed76","exp","black","south66","smsa66r",
                                  "reg76r","smsa76r","nearc4"),
            by = "nearc4",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{p}% ({n} / {N})")
            )
#we can observe that there are very large differences


# now try a different instrument: 
# indicator whether parents were at home when the child was 14
# or use the ivreg function that we used before
lmod_iv2 <- ivreg(lwage76 ~ ed76 + exp + exp2 + black + reg76r+
                   smsa76r + smsa66r + reg662 + reg663 + reg664 +
                   reg665 + reg666 + reg667 + reg668 + reg669|
                   momdad14 + exp + exp2 + black + reg76r+
                   smsa76r + smsa66r + reg662 + reg663 + reg664 +
                   reg665 + reg666 + reg667 + reg668 + reg669,
                 data = data_card2)
summary(lmod_iv2)


# Replication of Table 4 AJR 2021 #####
#detailed description here: http://economics.mit.edu/faculty/acemoglu/data/ajr2001
# downloaded
# note that AJR data is also in "hdm" package, but there are small differences

AJR2<- read_dta(file = "AJRmaketable4.dta")
AJR2 <- AJR2 %>% filter(baseco==1)

#Column 1
# no variables, base sample
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2) )

#Column 2
# with Latitute, base sample
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2) )


#Column 3
# no variables, without 4 Neo-Europe countries
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2, subset=(rich4==0)) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2, subset=(rich4==0)) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2, subset=(rich4==0)) )

#Column 4
# with Latitute, without 4 Neo-Europe countries
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2, subset=(rich4==0)) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2, subset=(rich4==0)) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2, subset=(rich4==0)) )


#Column 5
# no variables, without Aftrica
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2, subset=(africa==0)) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2, subset=(africa==0)) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2, subset=(africa==0)) )

#Column 6
# with Latitute, without Aftrica
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2, subset=(africa==0)) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2, subset=(africa==0)) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2, subset=(africa==0)) )

AJR2$other_cont <- (AJR2$shortnam=="AUS") + (AJR2$shortnam=="MLT") + (AJR2$shortnam=="NZL")


#Column 7
# continent dummies
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + asia + africa + other_cont |
                            logem4 + asia + africa + other_cont, data = AJR2) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4 + asia + africa + other_cont, data = AJR2) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr + asia + africa + other_cont, data = AJR2) )

#Column 8
# continent dummies with Latitute
#IV
summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + asia + africa + other_cont + lat_abst|
                            logem4 + asia + africa + other_cont + lat_abst, data = AJR2) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4 + asia + africa + other_cont + lat_abst, data = AJR2) )
#OLS
summary( lmod_ols <- lm(logpgp95 ~ avexpr + asia + africa + other_cont + lat_abst, data = AJR2) )

#Column 9
# no variables, base sample but different outcome loghjypl - log output per worker
#IV
summary( lmod_iv <- ivreg(loghjypl ~ avexpr| logem4, data = AJR2) )
#First stage
summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2) )
#OLS
summary( lmod_ols <- lm(loghjypl ~ avexpr, data = AJR2) )