# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, ivreg, modelsummary,
               gtsummary, hdm, tidyverse, haven, MatchIt, Zelig, stargazer,
               magrittr, optmatch, lmtest, sandwich, cobalt, dagitty, ggdag)

setwd("/Users/lukaslaffers/Dropbox/econx phd/code")


# Example 11 - discrimination #####
# modified from
# https://mixtape.scunning.com/dag.html?panelset=r-code#discrimination-and-collider-bias

G = dagitty('dag{
Y [outcome,pos="2, 0"]
G [exposure,pos="0, 1"]
X [confounder, pos="1,1"]
D [unobserved, pos="0,0"]
U [unobserved, pos="2, 1"]
D -> Y
D -> X
X -> Y
U -> X
U -> Y
G -> D}')

# plot the DAG
ggdag(G) + theme_dag()

# determine the minimal adjustment sets
adjustmentSets(G, "G", "Y", effect = "total")

tb <- tibble(
  female = ifelse(runif(10000) >= 0.5, 1, 0),
  ability = rnorm(10000),
  discrimination = female,
  occupation = 1 + 2*ability + 0*female - 2*discrimination + rnorm(10000),
  wage = 1 - 1*discrimination + 1*occupation + 2*ability + rnorm(10000)
)

lm_1 <- lm(wage ~ female, tb)
lm_2 <- lm(wage ~ female + occupation, tb)
lm_3 <- lm(wage ~ female + occupation + ability, tb)

stargazer(lm_1, lm_2, lm_3, type = "text",
          column.labels = c("Biased Unconditional", "Biased", "Unbiased Conditional"))


# Small illustrative examples #####
# with the help of https://stats.stackexchange.com/questions/445578/how-do-dags-help-to-reduce-bias-in-causal-inference

# EXAMPLE 1
# Confounding
set.seed(4643)
N <- 500
Smoking <- rnorm(N)
Cancer <- Smoking + rnorm(N)
Lighter <- Smoking + rnorm(N)

# the association between cancer and lighter ownership is confounded by a common cause - smoking
summary(lm(Cancer ~ Lighter))
# once we control for smoking, the association is gone
summary(lm(Cancer ~ Lighter + Smoking))

Gcon = dagitty('dag{
C [outcome,pos="2, 0"]
L [exposure,pos="1, 1"]
S [confounder, pos="0,0"]
S -> L
S -> C}')

# plot the DAG
ggdag(Gcon) + theme_dag()

# EXAMPLE 2
# Mediation analysis
set.seed(7243)
N <- 1000
Grades <- rnorm(N, 10, 2)
SelfEsteem <- Grades + rnorm(N)
Happiness <- Grades + 2*SelfEsteem + rnorm(N)

# the total effect of grades on happiness
summary(lm(Happiness ~ Grades))
# controlling for the mediator (self-esteem) leaves only the direct effect
summary(lm(Happiness ~ Grades + SelfEsteem))

Gmed = dagitty('dag{
H [outcome,pos="2, 0"]
S [confounder,pos="1, 1"]
G [exposure, pos="0,0"]
G -> S
S -> H
G -> H}')

# plot the DAG (note: S, self-esteem, is a mediator here, not a confounder)
ggdag(Gmed) + theme_dag()

# EXAMPLE 3
# Collider bias
set.seed(5424)
N <- 200
Talent <- rnorm(N)
Looks <- rnorm(N)
Success <- Talent + Looks + rnorm(N)

# talent and looks are independent, so there is no association
summary(lm(Talent ~ Looks))
# once we condition on the collider (success), a spurious association appears
summary(lm(Talent ~ Looks + Success))

Gcol = dagitty('dag{
S [outcome,pos="2, 0"]
T [confounder,pos="1, 1"]
L [exposure, pos="0,0"]
T -> S
L -> S}')

# plot the DAG (note: S, success, is a collider here)
ggdag(Gcol) + theme_dag()
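# Added sketch (not in the original script): collider bias also shows up as
# sample selection. Reusing Talent, Looks and Success from Example 3, we
# condition on the collider by selecting the "successful" subsample instead
# of putting Success on the right-hand side of a regression.
successful <- Success > quantile(Success, 0.8)
cor(Talent, Looks)                          # roughly zero in the full sample
cor(Talent[successful], Looks[successful])  # negative among the successful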
# Simpson's machine #####
# http://dagitty.net/learn/simpson/index.html


# Dagitty Example #####
# More elaborate example
# https://www.kaggle.com/victorchernozhukov/identification-analysis-of-401-k-example-w-dags
# Victor Chernozhukov's lecture on DAGs - he takes all the credit
# This example demonstrates nicely why DAGs are useful beyond the super-simple
# toy examples that you typically see.
# 401k is a pension saving programme that provides some tax benefits to employees.
# Does eligibility for 401k have an effect on savings?

# Y - outcome - net financial assets
# X - worker characteristics (income, family characteristics, other retirement plans)
# F - unobserved firm characteristics
# D - treatment - 401k eligibility
# A - unobserved variables that jointly affect D, F, X
# M - matched amount (by employer)

# define the DAG
G1 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
D -> Y
X -> D
F -> X
F -> D
X -> Y}')

# plot the DAG
ggdag(G1) + theme_dag()

# determine the minimal adjustment sets
adjustmentSets(G1, "D", "Y", effect = "total")

# look at the relatives
print(parents(G1, "D"))
print(children(G1, "D"))
print(ancestors(G1, "D"))
print(descendants(G1, "D"))

# look at the paths
paths(G1, "D", "Y")

# look at the conditional independencies that are implied by this DAG
print(impliedConditionalIndependencies(G1))

# we can list all the effects that are identifiable by conditioning
for (n in names(G1)) {
  for (m in children(G1, n)) {
    a <- adjustmentSets(G1, n, m)
    if (length(a) > 0) {
      cat("The effect ", n, "->", m, " is identifiable by controlling for:\n", sep = "")
      print(a, prefix = " * ")
    }
  }
}

# plot all the DAGs that are observationally equivalent
plot(equivalenceClass(G1))

G3 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
X -> Y}')

adjustmentSets(G3, "D", "Y", effect = "total")
ggdag(G3) + theme_dag()
# controlling for X only is still fine
# it satisfies the backdoor criterion as it closes all the backdoor paths

# if F directly affects Y too, the graph is the following
G4 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
F -> Y
X -> Y}')

ggdag(G4) + theme_dag()
adjustmentSets(G4, "D", "Y", effect = "total")
# there is no adjustment set
# the reason is the D <- F -> Y path, which cannot be closed because F is unobserved
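# Added sketch: we can inspect the problem directly with dagitty's paths().
# Conditioning on X (Z = "X") closes every backdoor path that runs through X,
# but D <- F -> Y and D <- A -> F -> Y remain open; F and A are unobserved,
# so they cannot be conditioned on.
paths(G4, "D", "Y", Z = c("X"))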
# what if there is a mediator variable M?
G5 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
D -> M
M -> Y
X -> M
X -> Y}')

ggdag(G5) + theme_dag()
print(adjustmentSets(G5, "D", "Y", effect = "total"))
# adjusting for X is fine for the identification of the main effect

# introduce a link between F and M
G6 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
D -> M
F -> M
A -> D
M -> Y
X -> M
X -> Y}')

ggdag(G6) + theme_dag()
print(adjustmentSets(G6, "D", "Y", effect = "total"))
# no adjustment set again

# here are the paths
paths(G6, "D", "Y")


# Matching Example: Titanic #####
# modified version of:
# https://mixtape.scunning.com/matching-and-subclassification.html?panelset1=r-code2&panelset=r-code

read_data <- function(df) {
  full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

# treatment d = travelling in first class
titanic <- read_data("titanic.dta") %>%
  mutate(d = case_when(class == 1 ~ 1, TRUE ~ 0))

# simple difference in outcomes (SDO)
ey1 <- titanic %>% filter(d == 1) %>% pull(survived) %>% mean()
ey0 <- titanic %>% filter(d == 0) %>% pull(survived) %>% mean()
(sdo <- ey1 - ey0)

# four strata from the sex x age combinations
titanic %<>%
  mutate(s = case_when(sex == 0 & age == 1 ~ 1,
                       sex == 0 & age == 0 ~ 2,
                       sex == 1 & age == 1 ~ 3,
                       sex == 1 & age == 0 ~ 4,
                       TRUE ~ 0))

# within-stratum means for treated and controls
ey11 <- titanic %>% filter(s == 1 & d == 1) %$% mean(survived)
ey10 <- titanic %>% filter(s == 1 & d == 0) %$% mean(survived)
ey21 <- titanic %>% filter(s == 2 & d == 1) %$% mean(survived)
ey20 <- titanic %>% filter(s == 2 & d == 0) %$% mean(survived)
ey31 <- titanic %>% filter(s == 3 & d == 1) %$% mean(survived)
ey30 <- titanic %>% filter(s == 3 & d == 0) %$% mean(survived)
ey41 <- titanic %>% filter(s == 4 & d == 1) %$% mean(survived)
ey40 <- titanic %>% filter(s == 4 & d == 0) %$% mean(survived)

# within-stratum differences
diff1 = ey11 - ey10
diff2 = ey21 - ey20
diff3 = ey31 - ey30
diff4 = ey41 - ey40

# stratum weights among the non-treated
obsNT = nrow(titanic %>% filter(d == 0))
wt1NT <- titanic %>% filter(s == 1 & d == 0) %$% nrow(.)/obsNT
wt2NT <- titanic %>% filter(s == 2 & d == 0) %$% nrow(.)/obsNT
wt3NT <- titanic %>% filter(s == 3 & d == 0) %$% nrow(.)/obsNT
wt4NT <- titanic %>% filter(s == 4 & d == 0) %$% nrow(.)/obsNT

# stratum weights among the treated
obsT = nrow(titanic %>% filter(d == 1))
wt1T <- titanic %>% filter(s == 1 & d == 1) %$% nrow(.)/obsT
wt2T <- titanic %>% filter(s == 2 & d == 1) %$% nrow(.)/obsT
wt3T <- titanic %>% filter(s == 3 & d == 1) %$% nrow(.)/obsT
wt4T <- titanic %>% filter(s == 4 & d == 1) %$% nrow(.)/obsT

# stratum weights in the whole sample
obs = nrow(titanic)
wt1 <- titanic %>% filter(s == 1) %$% nrow(.)/obs
wt2 <- titanic %>% filter(s == 2) %$% nrow(.)/obs
wt3 <- titanic %>% filter(s == 3) %$% nrow(.)/obs
wt4 <- titanic %>% filter(s == 4) %$% nrow(.)/obs

# weighted averages of the stratum-specific differences
wateNT = diff1*wt1NT + diff2*wt2NT + diff3*wt3NT + diff4*wt4NT
wateT  = diff1*wt1T  + diff2*wt2T  + diff3*wt3T  + diff4*wt4T
wate   = diff1*wt1   + diff2*wt2   + diff3*wt3   + diff4*wt4

stargazer(sdo, wate, wateT, wateNT, type = "text")
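# Added sketch (a cross-check, not in the original): the same stratified
# estimates computed compactly with group_by(); reuses titanic, s and d from
# above and should reproduce wate, wateT and wateNT (exactly so when the four
# strata cover the whole sample).
strata <- titanic %>%
  filter(s %in% 1:4) %>%
  group_by(s) %>%
  summarise(diff = mean(survived[d == 1]) - mean(survived[d == 0]),
            n = n(), n1 = sum(d == 1), n0 = sum(d == 0))
with(strata, c(wate   = weighted.mean(diff, n),
               wateT  = weighted.mean(diff, n1),
               wateNT = weighted.mean(diff, n0)))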
# Regression and Matching #####
# explained on p. 21 of the NBER wp 5192 version of Angrist (1995)
# p. 11 in this presentation:
# https://ocw.mit.edu/courses/economics/14-387-applied-econometrics-mostly-harmless-big-data-fall-2014/lecture-and-recitation-notes/MIT14_387F14_Matching.pdf
# p. 71 in Mostly Harmless Econometrics by Angrist and Pischke

N <- 1000
set.seed(849)
U_D <- rnorm(N)
U_X <- rnorm(N)
X <- 1*(U_X > 0)
D <- 1*(U_D > 0)
# with D independent of X, matching and regression will be similar,
# despite heterogeneous effects
# D <- 1*(U_D - U_X > 0)  # alternative: dependence between X and D is strong
e <- rnorm(N)
beta_0 <- 1
beta_1 <- 1
delta_true_0 <- 1
delta_true_1 <- 5

# this is the full model
Y <- beta_0 + beta_1*X + delta_true_0*D*(1-X) + delta_true_1*D*X + e

# Regression
lm(Y ~ D + X)  # careful, there is no interaction term here(!)
# regression assumes homogeneous effects

# this gives us the same value (Frisch-Waugh-Lovell partialling out)
tildeD <- residuals(lm(D ~ X))
cov(Y, tildeD)/var(tildeD)
# think about this in terms of projections

# these are the stratum-specific effects (under unconfoundedness)
delta1 <- mean(Y[X*D==1]) - mean(Y[X*(1-D)==1])
delta0 <- mean(Y[(1-X)*D==1]) - mean(Y[(1-X)*(1-D)==1])

w1 <- (mean(X*D==1)/mean(X==1)) * (1 - mean(X*D==1)/mean(X==1)) * mean(X==1)
w0 <- (mean((1-X)*D==1)/mean((1-X)==1)) * (1 - mean((1-X)*D==1)/mean((1-X)==1)) * mean((1-X)==1)
w1R <- w1/(w1+w0)
w0R <- w0/(w1+w0)

# regression is a weighted sum of the stratum-specific effects
deltaR <- delta1*w1R + delta0*w0R
# (even though there was no interaction term in the regression,
# nothing stops us from writing its estimand down like this)

# Matching
w1M <- mean(X*D==1)/mean(D==1)
w0M <- mean((1-X)*D==1)/mean(D==1)
deltaM <- delta1*w1M + delta0*w0M

# compare the two
c(deltaR, deltaM)

# matching placed more weight on delta1 or delta0 depending on which of
# Pr(D=1|X=1) and Pr(D=1|X=0) is larger
c(w1M, w0M)
# regression weights by the conditional variance of D given X,
# Var(D|X) = Pr(D=1|X)(1 - Pr(D=1|X)); this was similar for both strata
c(w1R, w0R)

# heterogeneity on its own does not create a lot of problems;
# it is heterogeneity PLUS different weights that leads to differences


# PSM and IPW #####
# source: https://mixtape.scunning.com/matching-and-subclassification.html

read_data <- function(df) {
  full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

nsw_dw <- read_data("nsw_mixtape.dta")

# look at the summary statistics of the treated
nsw_dw %>% filter(treat == 1) %>% pull(re78) %>% summary()
mean1 <- nsw_dw %>% filter(treat == 1) %>% pull(re78) %>% mean()
nsw_dw$y1 <- mean1

# look at the summary statistics of the controls
nsw_dw %>% filter(treat == 0) %>% pull(re78) %>% summary()
mean0 <- nsw_dw %>% filter(treat == 0) %>% pull(re78) %>% mean()
nsw_dw$y0 <- mean0

# raw difference in means
ate <- unique(nsw_dw$y1 - nsw_dw$y0)

# keep only the experimental treated and drop y1, y0
# (the experimental controls are replaced by CPS controls below)
nsw_dw <- nsw_dw %>% filter(treat == 1) %>% select(-y1, -y0)

nsw_dw_cpscontrol <- read_data("cps_mixtape.dta") %>%
  bind_rows(nsw_dw) %>%
  mutate(agesq = age^2,
         agecube = age^3,
         educsq = educ*educ,
         u74 = case_when(re74 == 0 ~ 1, TRUE ~ 0),
         u75 = case_when(re75 == 0 ~ 1, TRUE ~ 0),
         interaction1 = educ*re74,
         re74sq = re74^2,
         re75sq = re75^2,
         interaction2 = u74*hisp)

# estimation of the propensity score by logit regression
logit_nsw <- glm(treat ~ age + agesq + agecube + educ + educsq +
                   marr + nodegree + black + hisp + re74 + re75 +
                   u74 + u75 + interaction1,
                 family = binomial(link = "logit"),
                 data = nsw_dw_cpscontrol)

# create a new variable "pscore"
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(pscore = logit_nsw$fitted.values)

# mean pscore by treatment status
pscore_control <- nsw_dw_cpscontrol %>% filter(treat == 0) %>% pull(pscore) %>% mean()
pscore_treated <- nsw_dw_cpscontrol %>% filter(treat == 1) %>% pull(pscore) %>% mean()

# histograms of the pscores
nsw_dw_cpscontrol %>% filter(treat == 0) %>%
  ggplot() + geom_histogram(aes(x = pscore))
nsw_dw_cpscontrol %>% filter(treat == 1) %>%
  ggplot() + geom_histogram(aes(x = pscore))
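# Added sketch (not in the original script): a quick numerical look at overlap
# to complement the histograms above. If one group's propensity scores pile up
# near 0 or 1, common support is weak and the IPW weights below will explode;
# this is what motivates the trimming further down.
nsw_dw_cpscontrol %>%
  group_by(treat) %>%
  summarise(min = min(pscore), mean = mean(pscore), max = max(pscore))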
N <- nrow(nsw_dw_cpscontrol)

#- Manual with non-normalized (Horvitz-Thompson) weights using all data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(d1 = treat/pscore,
         d0 = (1-treat)/(1-pscore))

# save the denominators (for normalizing the weights below)
s1 <- sum(nsw_dw_cpscontrol$d1)
s0 <- sum(nsw_dw_cpscontrol$d0)

nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = treat * re78/pscore,
         y0 = (1-treat) * re78/(1-pscore),
         ht = y1 - y0)

#- Manual with normalized (Hajek) weights
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = (treat*re78/pscore)/(s1/N),
         y0 = ((1-treat)*re78/(1-pscore))/(s0/N),
         norm = y1 - y0)

# calculate the means
nsw_dw_cpscontrol %>% pull(ht) %>% mean()
nsw_dw_cpscontrol %>% pull(norm) %>% mean()
# these numbers are huge!

#-- trimming the propensity score: keep 0.1 < pscore < 0.9
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  select(-d1, -d0, -y1, -y0, -ht, -norm) %>%
  filter(pscore < 0.9, pscore > 0.1)

N <- nrow(nsw_dw_cpscontrol)

#- Manual with non-normalized weights using trimmed data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(d1 = treat/pscore,
         d0 = (1-treat)/(1-pscore))

s1 <- sum(nsw_dw_cpscontrol$d1)
s0 <- sum(nsw_dw_cpscontrol$d0)

nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = treat * re78/pscore,
         y0 = (1-treat) * re78/(1-pscore),
         ht = y1 - y0)

#- Manual with normalized weights using trimmed data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = (treat*re78/pscore)/(s1/N),
         y0 = ((1-treat)*re78/(1-pscore))/(s0/N),
         norm = y1 - y0)

nsw_dw_cpscontrol %>% pull(ht) %>% mean()
nsw_dw_cpscontrol %>% pull(norm) %>% mean()
# much more reasonable numbers, closer to LaLonde's (1986) numbers


# Matching in R "MatchIt package" #####
# https://cran.r-project.org/web/packages/MatchIt/vignettes/MatchIt.html#introduction

data("lalonde")
head(lalonde)

m.out0 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = NULL, distance = "glm")
# checking balance prior to matching
summary(m.out0)

# 1:1 nearest-neighbor (NN) PS matching w/o replacement
m.out1 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = "nearest", distance = "glm")
m.out1

# checking balance after NN matching
summary(m.out1, un = FALSE)

# distribution of propensity scores
plot(m.out1, type = "jitter", interactive = FALSE)
# qq plots
plot(m.out1, type = "qq", interactive = FALSE,
     which.xs = c("age", "married", "re75"))

# full matching on a probit PS
m.out2 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = "full", distance = "glm", link = "probit")
m.out2

# checking balance after full matching
summary(m.out2, un = FALSE)
# balance now looks much better
plot(summary(m.out2))

# look at the matched data
m.data1 <- match.data(m.out1)
head(m.data1)

# regression estimation from m.out1
fit1 <- lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75,
           data = m.data1, weights = weights)
coeftest(fit1, vcov. = vcovCL, cluster = ~subclass)

m.data2 <- match.data(m.out2)

# regression estimation from m.out2 (full matching)
fit2 <- lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75,
           data = m.data2, weights = weights)
coeftest(fit2, vcov. = vcovCL, cluster = ~subclass)

# the second matching achieved better balance,
# so we tend to believe it more
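# Added sketch: a cluster-robust 95% confidence interval for the treatment
# effect from the preferred full-matching fit; assumes lmtest::coefci forwards
# the cluster argument to vcovCL the same way coeftest does.
coefci(fit2, "treat", vcov. = vcovCL, cluster = ~subclass)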
data("lalonde", package = "MatchIt") #Full matching on a logistic regression PS m.out <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, method = "full") m.out summary(m.out, addlvariables = ~ I(age^2) + I(re74==0) + I(re75==0) + educ:race) m.sum <- summary(m.out, addlvariables = ~ I(age^2) + I(re74==0) + I(re75==0) + educ:race) plot(m.sum, var.order = "unmatched") #eQQ plot plot(m.out, type = "qq", which.xs = c("age", "nodegree", "re74")) #eCDF plot plot(m.out, type = "ecdf", which.xs = c("educ", "married", "re75")) #Subclassification on a logistic regression PS s.out <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, method = "subclass", subclass = 4) s.out summary(s.out) summary(s.out, subclass = TRUE, un = FALSE) s <- summary(s.out, subclass = TRUE) plot(s, var.order = "unmatched", abs = FALSE) plot(s.out, type = "histogram", which.xs = c("educ", "married", "re75"), subclass = 1) #library("cobalt") #library for beautiful balance graphs bal.tab(m.out, un = TRUE, stats = c("m", "v", "ks")) bal.tab(m.out, un = TRUE, stats = c("m", "v", "ks")) #Nearest neighbor (NN) matching on the PS m.out2 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde) #Balance on covariates after full and NN matching bal.tab(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, un = TRUE, weights = list(full = m.out, nn = m.out2)) love.plot(m.out, binary = "std") love.plot(m.out, stats = c("m", "ks"), poly = 2, abs = TRUE, weights = list(nn = m.out2), drop.distance = TRUE, thresholds = c(m = .1), var.order = "unadjusted", binary = "std", shapes = c("triangle", "square", "circle"), colors = c("blue", "darkgreen", "red"), sample.names = c("Full Matching", "NN Matching", "Original"), position = "bottom") #Density plot for continuous variables bal.plot(m.out, var.name = "educ", which = "both") #Bar graph for categorical variables bal.plot(m.out, var.name = "race", which = "both") #Mirrored histogram bal.plot(m.out, var.name = "distance", which = "both", type = "histogram", mirror = TRUE)