# Loading packages ##### #"pacman" is a package that makes loading easier if (!require("pacman")) install.packages("pacman") pacman::p_load(datasets, ggplot2, faraway, readxl, ivreg, modelsummary, gtsummary, hdm, tidyverse, haven) setwd("/Users/lukaslaffers/Dropbox/econx phd/code") # Returns to schooling example ##### # Endogenous education is instrumented by nearby college dummy #https://cran.r-project.org/web/packages/ivreg/vignettes/ivreg.html data("SchoolingReturns", package = "ivreg") summary(SchoolingReturns[, 1:8]) #run OLS regression m_ols <- lm(log(wage) ~ education + poly(experience, 2) + ethnicity + smsa + south, data = SchoolingReturns) summary(m_ols) #nice, so many stars! #run IV regression, use nearcollege as an instrument m_iv <- ivreg(log(wage) ~ education + poly(experience, 2) + ethnicity + smsa + south | nearcollege + poly(age, 2) + ethnicity + smsa + south, data = SchoolingReturns) summary(m_iv) #this comparison is from the modelsummary package m_list <- list(OLS = m_ols, IV = m_iv) msummary(m_list) modelplot(m_list, coef_omit = "Intercept|experience") # Card (1995) again ##### # We can replicate some parts of Table 3 from Card (1995) # Data downloaded from https://davidcard.berkeley.edu/data_sets.html # from paper: # "Using Geographic Variation in College Proximity to Estimate the Return to Schooling". # In L.N. Christofides, E.K. Grant, and R. Swidinsky, editors, Aspects of Labor Market # Behaviour: Essays in Honour of John Vanderkamp. Toronto: University of Toronto Press, 1995. # Data also here: https://docs.google.com/spreadsheets/d/116KwP9SJUQ8LZHeIH-sTvlFFwCgkKIsz86exk1I7O1s/edit#gid=943363882 # save as card.xlsx # Inspired by Problem 4 from https://sites.google.com/view/microeconometricswithr/problems/problems-ch1 setwd("/Users/lukaslaffers/Dropbox/econx phd/code") # this uses readxl package data_card <- read_excel("card.xlsx") data_card$lwage76 <- as.numeric(data_card$lwage76) data_card2 <- data_card[is.na(data_card$lwage76)==0,] data_card2$exp <- data_card2$age76 - data_card2$ed76 - 6 data_card2$exp2 <- (data_card2$exp^2)/100 #intention to treat regression lmITT <- lm(lwage76 ~ nearc4 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669, data=data_card2) lmITT$coeff[2] #first stage regression lmFS <- lm(ed76 ~ nearc4 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669, data=data_card2) lmFS$coeff[2] #IV estimate is the fraction of the two lmITT$coeff[2]/lmFS$coeff[2] #compare this to regular OLS lmOLS <- lm(lwage76 ~ ed76 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669, data=data_card2) lmOLS$coeff[2] attach(data_card2) # IV Matrix algebra ##### #try the matrix algebra n <- length(lwage76) y <- lwage76 X <- cbind(rep(1,n), exp,exp2,black,reg76r, smsa76r,smsa66r,reg662,reg663,reg664, reg665,reg666,reg667,reg668,reg669, ed76) Z <- cbind(rep(1,n), exp,exp2,black,reg76r, smsa76r,smsa66r,reg662,reg663,reg664, reg665,reg666,reg667,reg668,reg669, nearc4) hatX <- Z %*% solve(t(Z) %*% Z) %*% t(Z) %*% X (hatBeta <- solve(t(hatX) %*% hatX) %*% t(hatX) %*% y) PZ <- Z %*% solve(t(Z) %*% Z) %*% t(Z) (hatBeta_alt <- solve(t(X) %*% PZ %*% X) %*% t(X) %*% PZ %*% y) #perform bootstrap to get standard errors set.seed(132) nB <- 100 results <- numeric(nB) for (iB in 1:nB){ ind <- sample(1:n,rep=TRUE) print(iB) yb <- y[ind] Xb <- X[ind,] Zb <- Z[ind,] hatXb <- Zb %*% solve(t(Zb) %*% Zb) %*% t(Zb) %*% Xb results[iB] <- (solve(t(hatXb) %*% hatXb) %*% t(hatXb) %*% yb)[16] } sd(results) # or use the ivreg function that we used before lmod_iv <- ivreg(lwage76 ~ ed76 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669| nearc4 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669, data = data_card2) summary(lmod_iv) #is the nearc4 truly as good as random? tbl_summary(data_card2 %>% select("ed76","exp","black","south66","smsa66r", "reg76r","smsa76r","nearc4"), by = "nearc4", statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{p}% ({n} / {N})") ) #we can observe that there are very large differences # now try a different instrument: # indicator whether parents were at home when the child was 14 # or use the ivreg function that we used before lmod_iv2 <- ivreg(lwage76 ~ ed76 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669| momdad14 + exp + exp2 + black + reg76r+ smsa76r + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669, data = data_card2) summary(lmod_iv2) # Replication of Table 4 AJR 2021 ##### #detailed description here: http://economics.mit.edu/faculty/acemoglu/data/ajr2001 # downloaded # note that AJR data is also in "hdm" package, but there are small differences AJR2<- read_dta(file = "AJRmaketable4.dta") AJR2 <- AJR2 %>% filter(baseco==1) #Column 1 # no variables, base sample #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2) ) #Column 2 # with Latitute, base sample #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2) ) #Column 3 # no variables, without 4 Neo-Europe countries #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2, subset=(rich4==0)) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2, subset=(rich4==0)) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2, subset=(rich4==0)) ) #Column 4 # with Latitute, without 4 Neo-Europe countries #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2, subset=(rich4==0)) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2, subset=(rich4==0)) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2, subset=(rich4==0)) ) #Column 5 # no variables, without Aftrica #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr| logem4, data = AJR2, subset=(africa==0)) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2, subset=(africa==0)) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr, data = AJR2, subset=(africa==0)) ) #Column 6 # with Latitute, without Aftrica #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + lat_abst| logem4 + lat_abst, data = AJR2, subset=(africa==0)) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4 + lat_abst, data = AJR2, subset=(africa==0)) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr + lat_abst, data = AJR2, subset=(africa==0)) ) AJR2$other_cont <- (AJR2$shortnam=="AUS") + (AJR2$shortnam=="MLT") + (AJR2$shortnam=="NZL") #Column 7 # continent dummies #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + asia + africa + other_cont | logem4 + asia + africa + other_cont, data = AJR2) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4 + asia + africa + other_cont, data = AJR2) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr + asia + africa + other_cont, data = AJR2) ) #Column 8 # continent dummies with Latitute #IV summary( lmod_iv <- ivreg(logpgp95 ~ avexpr + asia + africa + other_cont + lat_abst| logem4 + asia + africa + other_cont + lat_abst, data = AJR2) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4 + asia + africa + other_cont + lat_abst, data = AJR2) ) #OLS summary( lmod_ols <- lm(logpgp95 ~ avexpr + asia + africa + other_cont + lat_abst, data = AJR2) ) #Column 9 # no variables, base sample but different outcome loghjypl - log output per worker #IV summary( lmod_iv <- ivreg(loghjypl ~ avexpr| logem4, data = AJR2) ) #First stage summary( lmod_fs <- lm(avexpr ~ logem4, data = AJR2) ) #OLS summary( lmod_ols <- lm(loghjypl ~ avexpr, data = AJR2) )