# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, ivreg, modelsummary,
               gtsummary, hdm, tidyverse, haven, MatchIt, Zelig, stargazer,
               magrittr, optmatch, lmtest, sandwich, cobalt, dagitty, ggdag)

setwd("/Users/lukaslaffers/Dropbox/econx phd/code")


# Example 11 - discrimination #####
# modified from
# https://mixtape.scunning.com/dag.html?panelset=r-code#discrimination-and-collider-bias

G = dagitty('dag{
Y [outcome,pos="2, 0"]
G [exposure,pos="0, 1"]
X [confounder, pos="1,1"]
D [unobserved, pos="0,0"]
U [unobserved, pos="2, 1"]
D -> Y
D -> X
X -> Y
U -> X
U -> Y
G -> D}')

# plot the DAG
ggdag(G) + theme_dag()

# determine the minimal adjustment sets
adjustmentSets(G, "G", "Y", effect = "total")

tb <- tibble(
  female = ifelse(runif(10000) >= 0.5, 1, 0),
  ability = rnorm(10000),
  discrimination = female,
  occupation = 1 + 2*ability + 0*female - 2*discrimination + rnorm(10000),
  wage = 1 - 1*discrimination + 1*occupation + 2*ability + rnorm(10000)
)

lm_1 <- lm(wage ~ female, tb)
lm_2 <- lm(wage ~ female + occupation, tb)
lm_3 <- lm(wage ~ female + occupation + ability, tb)

stargazer(lm_1, lm_2, lm_3, type = "text",
          column.labels = c("Biased Unconditional", "Biased", "Unbiased Conditional"))


# Small illustrative examples #####
# with the help of https://stats.stackexchange.com/questions/445578/how-do-dags-help-to-reduce-bias-in-causal-inference

# EXAMPLE 1
# Confounding
set.seed(4643)
N <- 500
Smoking <- rnorm(N)
Cancer <- Smoking + rnorm(N)
Lighter <- Smoking + rnorm(N)

# the association between cancer and lighter ownership is confounded by a common cause - smoking
summary(lm(Cancer ~ Lighter))
# once we control for smoking, the association is gone
summary(lm(Cancer ~ Lighter + Smoking))

Gcon = dagitty('dag{
C [outcome,pos="2, 0"]
L [exposure,pos="1, 1"]
S [confounder, pos="0,0"]
S -> L
S -> C}')

# plot the DAG
ggdag(Gcon) + theme_dag()

# EXAMPLE 2
# Mediation analysis
set.seed(7243)
N <- 1000
Grades <- rnorm(N, 10, 2)
SelfEsteem <- Grades + rnorm(N)
Happiness <- Grades + 2*SelfEsteem + rnorm(N)

# the total effect of grades on happiness
summary(lm(Happiness ~ Grades))
# controlling for the mediator (self-esteem) leaves only the direct effect
summary(lm(Happiness ~ Grades + SelfEsteem))

Gmed = dagitty('dag{
H [outcome,pos="2, 0"]
S [confounder,pos="1, 1"]
G [exposure, pos="0,0"]
G -> S
S -> H
G -> H}')

# plot the DAG (note: S, self-esteem, is a mediator here, not a confounder)
ggdag(Gmed) + theme_dag()

# EXAMPLE 3
# Collider bias
set.seed(5424)
N <- 200
Talent <- rnorm(N)
Looks <- rnorm(N)
Success <- Talent + Looks + rnorm(N)

# talent and looks are independent, so there is no association
summary(lm(Talent ~ Looks))
# once we condition on the collider (success), a spurious association appears
summary(lm(Talent ~ Looks + Success))

Gcol = dagitty('dag{
S [outcome,pos="2, 0"]
T [confounder,pos="1, 1"]
L [exposure, pos="0,0"]
T -> S
L -> S}')

# plot the DAG (note: S, success, is a collider here)
ggdag(Gcol) + theme_dag()
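# Added sketch (not in the original script): collider bias also shows up as
# sample selection. Reusing Talent, Looks and Success from Example 3, we
# condition on the collider by selecting the "successful" subsample instead
# of putting Success on the right-hand side of a regression.
successful <- Success > quantile(Success, 0.8)
cor(Talent, Looks)                          # roughly zero in the full sample
cor(Talent[successful], Looks[successful])  # negative among the successful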
# Simpson's machine #####
# http://dagitty.net/learn/simpson/index.html


# Dagitty Example #####
# More elaborate example
# https://www.kaggle.com/victorchernozhukov/identification-analysis-of-401-k-example-w-dags
# Victor Chernozhukov's lecture on DAGs - he takes all the credit
# This example demonstrates nicely why DAGs are useful beyond the super-simple
# toy examples that you typically see.
# 401k is a pension saving programme that provides some tax benefits to employees.
# Does eligibility for 401k have an effect on savings?

# Y - outcome - net financial assets
# X - worker characteristics (income, family characteristics, other retirement plans)
# F - unobserved firm characteristics
# D - treatment - 401k eligibility
# A - unobserved variables that jointly affect D, F, X
# M - matched amount (by employer)

# define the DAG
G1 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
D -> Y
X -> D
F -> X
F -> D
X -> Y}')

# plot the DAG
ggdag(G1) + theme_dag()

# determine the minimal adjustment sets
adjustmentSets(G1, "D", "Y", effect = "total")

# look at the relatives
print(parents(G1, "D"))
print(children(G1, "D"))
print(ancestors(G1, "D"))
print(descendants(G1, "D"))

# look at the paths
paths(G1, "D", "Y")

# look at the conditional independencies that are implied by this DAG
print(impliedConditionalIndependencies(G1))

# we can list all the effects that are identifiable by conditioning
for (n in names(G1)) {
  for (m in children(G1, n)) {
    a <- adjustmentSets(G1, n, m)
    if (length(a) > 0) {
      cat("The effect ", n, "->", m, " is identifiable by controlling for:\n", sep = "")
      print(a, prefix = " * ")
    }
  }
}

# plot all the DAGs that are observationally equivalent
plot(equivalenceClass(G1))

G3 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
X -> Y}')

adjustmentSets(G3, "D", "Y", effect = "total")
ggdag(G3) + theme_dag()
# controlling for X only is still fine
# it satisfies the backdoor criterion as it closes all the backdoor paths

# if F directly affects Y too, the graph is the following
G4 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
F -> Y
X -> Y}')

ggdag(G4) + theme_dag()
adjustmentSets(G4, "D", "Y", effect = "total")
# there is no adjustment set
# the reason is the D <- F -> Y path, which cannot be closed because F is unobserved
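# Added sketch: we can inspect the problem directly with dagitty's paths().
# Conditioning on X (Z = "X") closes every backdoor path that runs through X,
# but D <- F -> Y and D <- A -> F -> Y remain open; F and A are unobserved,
# so they cannot be conditioned on.
paths(G4, "D", "Y", Z = c("X"))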
# what if there is a mediator variable M?
G5 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
D -> M
M -> Y
X -> M
X -> Y}')

ggdag(G5) + theme_dag()
print(adjustmentSets(G5, "D", "Y", effect = "total"))
# adjusting for X is fine for the identification of the main effect

# introduce a link between F and M
G6 = dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
D -> M
F -> M
A -> D
M -> Y
X -> M
X -> Y}')

ggdag(G6) + theme_dag()
print(adjustmentSets(G6, "D", "Y", effect = "total"))
# no adjustment set again

# here are the paths
paths(G6, "D", "Y")


# Matching Example: Titanic #####
# modified version of:
# https://mixtape.scunning.com/matching-and-subclassification.html?panelset1=r-code2&panelset=r-code

read_data <- function(df) {
  full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

# treatment d = travelling in first class
titanic <- read_data("titanic.dta") %>%
  mutate(d = case_when(class == 1 ~ 1, TRUE ~ 0))

# simple difference in outcomes (SDO)
ey1 <- titanic %>% filter(d == 1) %>% pull(survived) %>% mean()
ey0 <- titanic %>% filter(d == 0) %>% pull(survived) %>% mean()
(sdo <- ey1 - ey0)

# four strata from the sex x age combinations
titanic %<>%
  mutate(s = case_when(sex == 0 & age == 1 ~ 1,
                       sex == 0 & age == 0 ~ 2,
                       sex == 1 & age == 1 ~ 3,
                       sex == 1 & age == 0 ~ 4,
                       TRUE ~ 0))

# within-stratum means for treated and controls
ey11 <- titanic %>% filter(s == 1 & d == 1) %$% mean(survived)
ey10 <- titanic %>% filter(s == 1 & d == 0) %$% mean(survived)
ey21 <- titanic %>% filter(s == 2 & d == 1) %$% mean(survived)
ey20 <- titanic %>% filter(s == 2 & d == 0) %$% mean(survived)
ey31 <- titanic %>% filter(s == 3 & d == 1) %$% mean(survived)
ey30 <- titanic %>% filter(s == 3 & d == 0) %$% mean(survived)
ey41 <- titanic %>% filter(s == 4 & d == 1) %$% mean(survived)
ey40 <- titanic %>% filter(s == 4 & d == 0) %$% mean(survived)

# within-stratum differences
diff1 = ey11 - ey10
diff2 = ey21 - ey20
diff3 = ey31 - ey30
diff4 = ey41 - ey40

# stratum weights among the non-treated
obsNT = nrow(titanic %>% filter(d == 0))
wt1NT <- titanic %>% filter(s == 1 & d == 0) %$% nrow(.)/obsNT
wt2NT <- titanic %>% filter(s == 2 & d == 0) %$% nrow(.)/obsNT
wt3NT <- titanic %>% filter(s == 3 & d == 0) %$% nrow(.)/obsNT
wt4NT <- titanic %>% filter(s == 4 & d == 0) %$% nrow(.)/obsNT

# stratum weights among the treated
obsT = nrow(titanic %>% filter(d == 1))
wt1T <- titanic %>% filter(s == 1 & d == 1) %$% nrow(.)/obsT
wt2T <- titanic %>% filter(s == 2 & d == 1) %$% nrow(.)/obsT
wt3T <- titanic %>% filter(s == 3 & d == 1) %$% nrow(.)/obsT
wt4T <- titanic %>% filter(s == 4 & d == 1) %$% nrow(.)/obsT

# stratum weights in the whole sample
obs = nrow(titanic)
wt1 <- titanic %>% filter(s == 1) %$% nrow(.)/obs
wt2 <- titanic %>% filter(s == 2) %$% nrow(.)/obs
wt3 <- titanic %>% filter(s == 3) %$% nrow(.)/obs
wt4 <- titanic %>% filter(s == 4) %$% nrow(.)/obs

# weighted averages of the stratum-specific differences
wateNT = diff1*wt1NT + diff2*wt2NT + diff3*wt3NT + diff4*wt4NT
wateT  = diff1*wt1T  + diff2*wt2T  + diff3*wt3T  + diff4*wt4T
wate   = diff1*wt1   + diff2*wt2   + diff3*wt3   + diff4*wt4

stargazer(sdo, wate, wateT, wateNT, type = "text")
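# Added sketch (a cross-check, not in the original): the same stratified
# estimates computed compactly with group_by(); reuses titanic, s and d from
# above and should reproduce wate, wateT and wateNT (exactly so when the four
# strata cover the whole sample).
strata <- titanic %>%
  filter(s %in% 1:4) %>%
  group_by(s) %>%
  summarise(diff = mean(survived[d == 1]) - mean(survived[d == 0]),
            n = n(), n1 = sum(d == 1), n0 = sum(d == 0))
with(strata, c(wate   = weighted.mean(diff, n),
               wateT  = weighted.mean(diff, n1),
               wateNT = weighted.mean(diff, n0)))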
# Regression and Matching #####
# explained on p. 21 of the NBER wp 5192 version of Angrist (1995)
# p. 11 in this presentation:
# https://ocw.mit.edu/courses/economics/14-387-applied-econometrics-mostly-harmless-big-data-fall-2014/lecture-and-recitation-notes/MIT14_387F14_Matching.pdf
# p. 71 in Mostly Harmless Econometrics by Angrist and Pischke

N <- 1000
set.seed(849)
U_D <- rnorm(N)
U_X <- rnorm(N)
X <- 1*(U_X > 0)
D <- 1*(U_D > 0)
# with D independent of X, matching and regression will be similar,
# despite heterogeneous effects
# D <- 1*(U_D - U_X > 0)  # alternative: dependence between X and D is strong
e <- rnorm(N)
beta_0 <- 1
beta_1 <- 1
delta_true_0 <- 1
delta_true_1 <- 5

# this is the full model
Y <- beta_0 + beta_1*X + delta_true_0*D*(1-X) + delta_true_1*D*X + e

# Regression
lm(Y ~ D + X)  # careful, there is no interaction term here(!)
# regression assumes homogeneous effects

# this gives us the same value (Frisch-Waugh-Lovell partialling out)
tildeD <- residuals(lm(D ~ X))
cov(Y, tildeD)/var(tildeD)
# think about this in terms of projections

# these are the stratum-specific effects (under unconfoundedness)
delta1 <- mean(Y[X*D==1]) - mean(Y[X*(1-D)==1])
delta0 <- mean(Y[(1-X)*D==1]) - mean(Y[(1-X)*(1-D)==1])

w1 <- (mean(X*D==1)/mean(X==1)) * (1 - mean(X*D==1)/mean(X==1)) * mean(X==1)
w0 <- (mean((1-X)*D==1)/mean((1-X)==1)) * (1 - mean((1-X)*D==1)/mean((1-X)==1)) * mean((1-X)==1)
w1R <- w1/(w1+w0)
w0R <- w0/(w1+w0)

# regression is a weighted sum of the stratum-specific effects
deltaR <- delta1*w1R + delta0*w0R
# (even though there was no interaction term in the regression,
# nothing stops us from writing its estimand down like this)

# Matching
w1M <- mean(X*D==1)/mean(D==1)
w0M <- mean((1-X)*D==1)/mean(D==1)
deltaM <- delta1*w1M + delta0*w0M

# compare the two
c(deltaR, deltaM)

# matching placed more weight on delta1 or delta0 depending on which of
# Pr(D=1|X=1) and Pr(D=1|X=0) is larger
c(w1M, w0M)
# regression weights by the conditional variance of D given X,
# Var(D|X) = Pr(D=1|X)(1 - Pr(D=1|X)); this was similar for both strata
c(w1R, w0R)

# heterogeneity on its own does not create a lot of problems;
# it is heterogeneity PLUS different weights that leads to differences


# PSM and IPW #####
# source: https://mixtape.scunning.com/matching-and-subclassification.html

read_data <- function(df) {
  full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

nsw_dw <- read_data("nsw_mixtape.dta")

# look at the summary statistics of the treated
nsw_dw %>% filter(treat == 1) %>% pull(re78) %>% summary()
mean1 <- nsw_dw %>% filter(treat == 1) %>% pull(re78) %>% mean()
nsw_dw$y1 <- mean1

# look at the summary statistics of the controls
nsw_dw %>% filter(treat == 0) %>% pull(re78) %>% summary()
mean0 <- nsw_dw %>% filter(treat == 0) %>% pull(re78) %>% mean()
nsw_dw$y0 <- mean0

# raw difference in means
ate <- unique(nsw_dw$y1 - nsw_dw$y0)

# keep only the experimental treated and drop y1, y0
# (the experimental controls are replaced by CPS controls below)
nsw_dw <- nsw_dw %>% filter(treat == 1) %>% select(-y1, -y0)

nsw_dw_cpscontrol <- read_data("cps_mixtape.dta") %>%
  bind_rows(nsw_dw) %>%
  mutate(agesq = age^2,
         agecube = age^3,
         educsq = educ*educ,
         u74 = case_when(re74 == 0 ~ 1, TRUE ~ 0),
         u75 = case_when(re75 == 0 ~ 1, TRUE ~ 0),
         interaction1 = educ*re74,
         re74sq = re74^2,
         re75sq = re75^2,
         interaction2 = u74*hisp)

# estimation of the propensity score by logit regression
logit_nsw <- glm(treat ~ age + agesq + agecube + educ + educsq +
                   marr + nodegree + black + hisp + re74 + re75 +
                   u74 + u75 + interaction1,
                 family = binomial(link = "logit"),
                 data = nsw_dw_cpscontrol)

# create a new variable "pscore"
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(pscore = logit_nsw$fitted.values)

# mean pscore by treatment status
pscore_control <- nsw_dw_cpscontrol %>% filter(treat == 0) %>% pull(pscore) %>% mean()
pscore_treated <- nsw_dw_cpscontrol %>% filter(treat == 1) %>% pull(pscore) %>% mean()

# histograms of the pscores
nsw_dw_cpscontrol %>% filter(treat == 0) %>%
  ggplot() + geom_histogram(aes(x = pscore))
nsw_dw_cpscontrol %>% filter(treat == 1) %>%
  ggplot() + geom_histogram(aes(x = pscore))
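# Added sketch (not in the original script): a quick numerical look at overlap
# to complement the histograms above. If one group's propensity scores pile up
# near 0 or 1, common support is weak and the IPW weights below will explode;
# this is what motivates the trimming further down.
nsw_dw_cpscontrol %>%
  group_by(treat) %>%
  summarise(min = min(pscore), mean = mean(pscore), max = max(pscore))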
N <- nrow(nsw_dw_cpscontrol)

#- Manual with non-normalized (Horvitz-Thompson) weights using all data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(d1 = treat/pscore,
         d0 = (1-treat)/(1-pscore))

# save the denominators (for normalizing the weights below)
s1 <- sum(nsw_dw_cpscontrol$d1)
s0 <- sum(nsw_dw_cpscontrol$d0)

nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = treat * re78/pscore,
         y0 = (1-treat) * re78/(1-pscore),
         ht = y1 - y0)

#- Manual with normalized (Hajek) weights
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = (treat*re78/pscore)/(s1/N),
         y0 = ((1-treat)*re78/(1-pscore))/(s0/N),
         norm = y1 - y0)

# calculate the means
nsw_dw_cpscontrol %>% pull(ht) %>% mean()
nsw_dw_cpscontrol %>% pull(norm) %>% mean()
# these numbers are huge!

#-- trimming the propensity score: keep 0.1 < pscore < 0.9
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  select(-d1, -d0, -y1, -y0, -ht, -norm) %>%
  filter(pscore < 0.9, pscore > 0.1)

N <- nrow(nsw_dw_cpscontrol)

#- Manual with non-normalized weights using trimmed data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(d1 = treat/pscore,
         d0 = (1-treat)/(1-pscore))

s1 <- sum(nsw_dw_cpscontrol$d1)
s0 <- sum(nsw_dw_cpscontrol$d0)

nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = treat * re78/pscore,
         y0 = (1-treat) * re78/(1-pscore),
         ht = y1 - y0)

#- Manual with normalized weights using trimmed data
nsw_dw_cpscontrol <- nsw_dw_cpscontrol %>%
  mutate(y1 = (treat*re78/pscore)/(s1/N),
         y0 = ((1-treat)*re78/(1-pscore))/(s0/N),
         norm = y1 - y0)

nsw_dw_cpscontrol %>% pull(ht) %>% mean()
nsw_dw_cpscontrol %>% pull(norm) %>% mean()
# much more reasonable numbers, closer to LaLonde's (1986) numbers


# Matching in R "MatchIt package" #####
# https://cran.r-project.org/web/packages/MatchIt/vignettes/MatchIt.html#introduction

data("lalonde")
head(lalonde)

m.out0 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = NULL, distance = "glm")
# checking balance prior to matching
summary(m.out0)

# 1:1 nearest-neighbor (NN) PS matching w/o replacement
m.out1 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = "nearest", distance = "glm")
m.out1

# checking balance after NN matching
summary(m.out1, un = FALSE)

# distribution of propensity scores
plot(m.out1, type = "jitter", interactive = FALSE)
# qq plots
plot(m.out1, type = "qq", interactive = FALSE,
     which.xs = c("age", "married", "re75"))

# full matching on a probit PS
m.out2 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75,
                  data = lalonde, method = "full", distance = "glm", link = "probit")
m.out2

# checking balance after full matching
summary(m.out2, un = FALSE)
# balance now looks much better
plot(summary(m.out2))

# look at the matched data
m.data1 <- match.data(m.out1)
head(m.data1)

# regression estimation from m.out1
fit1 <- lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75,
           data = m.data1, weights = weights)
coeftest(fit1, vcov. = vcovCL, cluster = ~subclass)

m.data2 <- match.data(m.out2)

# regression estimation from m.out2 (full matching)
fit2 <- lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75,
           data = m.data2, weights = weights)
coeftest(fit2, vcov. = vcovCL, cluster = ~subclass)

# the second matching achieved better balance,
# so we tend to believe it more
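# Added sketch: a cluster-robust 95% confidence interval for the treatment
# effect from the preferred full-matching fit; assumes lmtest::coefci forwards
# the cluster argument to vcovCL the same way coeftest does.
coefci(fit2, "treat", vcov. = vcovCL, cluster = ~subclass)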
data("lalonde", package = "MatchIt") #Full matching on a logistic regression PS m.out <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, method = "full") m.out summary(m.out, addlvariables = ~ I(age^2) + I(re74==0) + I(re75==0) + educ:race) m.sum <- summary(m.out, addlvariables = ~ I(age^2) + I(re74==0) + I(re75==0) + educ:race) plot(m.sum, var.order = "unmatched") #eQQ plot plot(m.out, type = "qq", which.xs = c("age", "nodegree", "re74")) #eCDF plot plot(m.out, type = "ecdf", which.xs = c("educ", "married", "re75")) #Subclassification on a logistic regression PS s.out <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, method = "subclass", subclass = 4) s.out summary(s.out) summary(s.out, subclass = TRUE, un = FALSE) s <- summary(s.out, subclass = TRUE) plot(s, var.order = "unmatched", abs = FALSE) plot(s.out, type = "histogram", which.xs = c("educ", "married", "re75"), subclass = 1) #library("cobalt") #library for beautiful balance graphs bal.tab(m.out, un = TRUE, stats = c("m", "v", "ks")) bal.tab(m.out, un = TRUE, stats = c("m", "v", "ks")) #Nearest neighbor (NN) matching on the PS m.out2 <- matchit(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde) #Balance on covariates after full and NN matching bal.tab(treat ~ age + educ + race + married + nodegree + re74 + re75, data = lalonde, un = TRUE, weights = list(full = m.out, nn = m.out2)) love.plot(m.out, binary = "std") love.plot(m.out, stats = c("m", "ks"), poly = 2, abs = TRUE, weights = list(nn = m.out2), drop.distance = TRUE, thresholds = c(m = .1), var.order = "unadjusted", binary = "std", shapes = c("triangle", "square", "circle"), colors = c("blue", "darkgreen", "red"), sample.names = c("Full Matching", "NN Matching", "Original"), position = "bottom") #Density plot for continuous variables bal.plot(m.out, var.name = "educ", which = "both") #Bar graph for categorical variables bal.plot(m.out, var.name = "race", which = "both") #Mirrored histogram bal.plot(m.out, var.name = "distance", which = "both", type = "histogram", mirror = TRUE)