# Loading packages #####
#"pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, 
               lattice, broom, gganimate,dplyr,
               devtools,ggpubr,
               tidyverse,broom,readxl,httr,
               wooldridge,multiway,lmtest,
               causalweight,
               DRDID,did, 
               fixest,data.table)
setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# RDD demonstration #####
set.seed(874233)
N <- 100
Test <- abs(rnorm(N,mean=120, sd=60))
D   <- 1*(Test >120)
Grade <- 6 + 0.01*Test + D*1 + 0.3*rnorm(N)
MeanGrade <- 6 + 0.01*Test + D*1

df <- data.frame(Test,Grade,MeanGrade)


p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration")
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd1.pdf")

p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd2.pdf")

p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  geom_line(data = subset(df,Test<120),aes(x=Test,y=MeanGrade),col="yellow",size=1.5)+
  geom_line(data = subset(df,Test>120.1),aes(x=Test,y=MeanGrade),col="blue",size=1.5)+
  xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd3.pdf")


p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  stat_smooth(data = subset(df,Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+
  stat_smooth(data = subset(df,Test>120.1),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (5th degree polynomial)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd4.pdf")


df2 <- df

df2[101,] <- c(119,8,NA)
df2[102,] <- c(121,7,NA)

p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  geom_point(data=subset(df2,Grade==7), aes(x=Test,y=Grade),col="red",size=2)+
  geom_point(data=subset(df2,Grade==8), aes(x=Test,y=Grade),col="red",size=2)+
  stat_smooth(data = subset(df2,Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+
  stat_smooth(data = subset(df2,Test>120.1),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (5th degree polynomial) + 2 obs")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd5.pdf")


p <- ggplot()+
  geom_point(data=df, aes(x=Test,y=Grade))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (Zoomed)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)+
  xlim(100,140)
  p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd6.pdf")


p <- ggplot()+
  geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+
  stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x)+
  stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x)+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (linear)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd7.pdf")


p <- ggplot()+
  geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+
  stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x+I(x^2))+
  stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x+I(x^2))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (quadratic)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd8.pdf")


p <- ggplot()+
  geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+
  stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3))+
  stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (cubic)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd9.pdf")


p <- ggplot()+
  geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+
  stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4))+
  stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", 
              formula= y~x+I(x^2)+I(x^3)+I(x^4))+
  xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (quartic)")+
  geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)
p
ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd10.pdf")


# Kernels ####

x <- seq(-2,2,by=0.01)
k1 <- dnorm(x)
k2 <- pmax(0*x+1 - 4*(x< -1) - 4*(x>1),0)
k3 <- pmax(1-abs(x),0)
k4 <- pmax(0.75*(1-x^2),0)

df <- data.frame(x,k1,k2,k3,k4)

p1 <- ggplot() + geom_line(data=df,aes(x=x,y=k1))+ylim(0,1)+ggtitle("Gaussian")+theme(axis.title.y=element_blank())
p2 <- ggplot() + geom_line(data=df,aes(x=x,y=k2))+ylim(0,1)+ggtitle("Uniform")+theme(axis.title.y=element_blank())
p3 <- ggplot() + geom_line(data=df,aes(x=x,y=k3))+ggtitle("Triangular")+theme(axis.title.y=element_blank())
p4 <- ggplot() + geom_line(data=df,aes(x=x,y=k4))+ggtitle("Epanechnikov")+theme(axis.title.y=element_blank())

plot <- ggarrange(p1,p2,p3,p4,ncol = 4, nrow = 1)

annotate_figure(plot, top = text_grob("Different kernels", face = "bold", size = 14))
ggsave(plot = plot, width = 8, height = 2, dpi = 300, filename = "kernels.pdf")

# DiD - demonstration, Simple 2x2 #####

# https://lost-stats.github.io/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html  


# Download the Excel file from a URL
tf <- tempfile(fileext = ".xlsx")
GET(
  "https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Two_by_Two_Difference_in_Difference/did_crime.xlsx",
  write_disk(tf)
)
DiD <- read_excel(tf)

#Data are collapsed already. 

DiD <- DiD %>%
  mutate(after = year >= 2014) %>%
  mutate(treatafter = after*treat)

mt <- ggplot(DiD,aes(x=year, y=murder, color = as.factor(treat))) +
  geom_point(size=3)+geom_line() +
  geom_vline(xintercept=2014,lty=4) +
  labs(title="Murder and Time", x="Year", y="Murder Rate")
mt

DiD$D <- as.factor(DiD$year - 2014)

summary(lm(murder ~ D, data=DiD))


# DiD 2x2 
reg<-lm(murder ~ treat+after+treatafter, data = DiD)
summary(reg)

#look at what the different coefficients mean
y11 <- mean(DiD$murder[DiD$treat==1 & DiD$after==1])
y10 <- mean(DiD$murder[DiD$treat==1 & DiD$after==0])
y01 <- mean(DiD$murder[DiD$treat==0 & DiD$after==1])
y00 <- mean(DiD$murder[DiD$treat==0 & DiD$after==0])

reg$coefficients[1]
y00

reg$coefficients[2]
y10 - y00

reg$coefficients[3]
y01 - y00

reg$coefficients[4] #this is the DiD estimator
(y11 - y10) - (y01 - y00)


# DiD - demonstration, Simple 2x2 (different library) #####

# Martin Huber's book (an early draft)
#https://drive.switch.ch/index.php/s/tNhKQmkGB48bjfz
# page 229

# load kielmc data
data(kielmc) 
attach(kielmc)
library(multiwayvcov)

#define variables
#real price of house measured in 1978 USD
Y <- rprice 
# house is close (D=1) of further away (D=0) from garbage incinerator
D <- nearinc 
T <-  y81
interact=D*T 
# DiD regression
did=lm(Y~D+T+interact ) 
# cluster: distance to center (cbd)
vcovCL=cluster.vcov (model=did , cluster=cbd)
# results with cluster st.error
coeftest(did, vcov=vcovCL)

# DiD - demonstration, Simple 2x2 with covariates (IPW based) #####

# Martin Huber's book (an early draft)
#https://drive.switch.ch/index.php/s/tNhKQmkGB48bjfz
# page 235

library(causalweight)
X <- cbind(area, rooms, baths)
set.seed(1)
out <- didweight(y=Y,d=D,t=T,x=X,boot=399,cluster=cbd)
out$effect ; out$se; out$pvalue


# DiD - with variation in the treatment time using "did" library #####
#based on Callaway, Brantly, and Pedro HC Sant’Anna. "Difference-in-differences with multiple time periods." 
#    Journal of Econometrics 225.2 (2021): 200-230.
# https://bcallaway11.github.io/did/
# https://bcallaway11.github.io/did/articles/did-basics.html
data(mpdta)

out <- att_gt(yname = "lemp",
              gname = "first.treat",
              idname = "countyreal",
              tname = "year",
              xformla = ~1,
              data = mpdta,
              est_method = "reg"
)

summary(out)

ggdid(out, ylim = c(-.25,.1))

es <- aggte(out, type = "dynamic")
ggdid(es)

group_effects <- aggte(out, type = "group")
summary(group_effects)


# DiD - event study plot #####
# https://lost-stats.github.io/Model_Estimation/Research_Design/event_study.html
# 
# Load and prepare data
dat = fread("https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Event_Study_DiD/bacon_example.csv") 

# Let's create a more user-friendly indicator of which states received treatment
dat[, treat := ifelse(is.na(`_nfd`), 0, 1)]

# Create a "time_to_treatment" variable for each state, so that treatment is
# relative for all treated units. For the never-treated (i.e. control) units,
# we'll arbitrarily set the "time_to_treatment" value at 0. This value 
# doesn't really matter, since it will be canceled by the treat==0 interaction
# anyway. But we do want to make sure they aren't NA, otherwise feols would drop 
# these never-treated observations at estimation time and our results will be 
# off.
dat[, time_to_treat := ifelse(treat==1, year - `_nfd`, 0)]

mod_twfe = feols(asmrs ~ i(time_to_treat, treat, ref = -1) + ## Our key interaction: time × treatment status
                   pcinc + asmrh + cases |                   ## Other controls
                   stfips + year,                            ## FEs
                   cluster = ~stfips,                        ## Clustered SEs
                   data = dat)

iplot(mod_twfe,
     xlab = 'Time to treatment',
     main = 'Event study: Staggered treatment (TWFE)')


# DiD - doubly robust estimator using "DRDID" library  #####
# doubly robust estimator based on
# Sant’Anna, Pedro H. C., and Zhao, Jun (2020), “Doubly Robust Difference-in-Differences Estimators”, Journal of Econometrics, Vol. 219 (1), pp. 101-122.
# https://pedrohcgs.github.io/DRDID/
# Panel data case
# Form the Lalonde sample with CPS comparison group
eval_lalonde_cps <- subset(nsw_long, nsw_long$treated == 0 | nsw_long$sample == 2)
# Further reduce sample to speed example
set.seed(123)
unit_random <- sample(unique(eval_lalonde_cps$id), 5000)
eval_lalonde_cps <- eval_lalonde_cps[eval_lalonde_cps$id %in% unit_random,]
# Implement improved DR locally efficient DID with panel data
drdid(yname="re", tname = "year", idname = "id", dname = "experimental",
      xformla= ~ age+ educ+ black+ married+ nodegree+ hisp+ re74,
      data = eval_lalonde_cps, panel = TRUE)
#Implement "traditional" DR locally efficient DID with panel data
drdid(yname="re", tname = "year", idname = "id", dname = "experimental",
      xformla= ~ age+ educ+ black+ married+ nodegree+ hisp+ re74,
      data = eval_lalonde_cps, panel = TRUE, estMethod = "trad")