# Loading packages ##### #"pacman" is a package that makes loading easier if (!require("pacman")) install.packages("pacman") pacman::p_load(datasets, ggplot2, faraway, readxl, lattice, broom, gganimate,dplyr, devtools,ggpubr, tidyverse,broom,readxl,httr, wooldridge,multiway,lmtest, causalweight, DRDID,did, fixest,data.table) setwd("/Users/lukaslaffers/Dropbox/econx phd/code") # RDD demonstration ##### set.seed(874233) N <- 100 Test <- abs(rnorm(N,mean=120, sd=60)) D <- 1*(Test >120) Grade <- 6 + 0.01*Test + D*1 + 0.3*rnorm(N) MeanGrade <- 6 + 0.01*Test + D*1 df <- data.frame(Test,Grade,MeanGrade) p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration") ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd1.pdf") p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd2.pdf") p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ geom_line(data = subset(df,Test<120),aes(x=Test,y=MeanGrade),col="yellow",size=1.5)+ geom_line(data = subset(df,Test>120.1),aes(x=Test,y=MeanGrade),col="blue",size=1.5)+ xlab("Test score") + ylab("Grade")+ggtitle("Regression Discontinuity Design Demonstration")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd3.pdf") p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ stat_smooth(data = subset(df,Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+ stat_smooth(data = subset(df,Test>120.1),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (5th degree polynomial)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd4.pdf") df2 <- df df2[101,] <- c(119,8,NA) df2[102,] <- c(121,7,NA) p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ geom_point(data=subset(df2,Grade==7), aes(x=Test,y=Grade),col="red",size=2)+ geom_point(data=subset(df2,Grade==8), aes(x=Test,y=Grade),col="red",size=2)+ stat_smooth(data = subset(df2,Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+ stat_smooth(data = subset(df2,Test>120.1),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (5th degree polynomial) + 2 obs")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd5.pdf") p <- ggplot()+ geom_point(data=df, aes(x=Test,y=Grade))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (Zoomed)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1)+ xlim(100,140) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd6.pdf") p <- ggplot()+ geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+ stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x)+ stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x)+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (linear)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd7.pdf") p <- ggplot()+ geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+ stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x+I(x^2))+ stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x+I(x^2))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (quadratic)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd8.pdf") p <- ggplot()+ geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+ stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x+I(x^2)+I(x^3))+ stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x+I(x^2)+I(x^3))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (cubic)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd9.pdf") p <- ggplot()+ geom_point(data=subset(df,Test>100 & Test<140), aes(x=Test,y=Grade))+ stat_smooth(data = subset(df,Test>100 & Test<120),aes(x=Test,y=Grade),col="yellow",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4))+ stat_smooth(data = subset(df,Test>120.1 & Test<140),aes(x=Test,y=Grade),col="blue",size=1, method="lm", formula= y~x+I(x^2)+I(x^3)+I(x^4))+ xlab("Test score") + ylab("Grade")+ggtitle("RDD Demonstration (quartic)")+ geom_vline(xintercept = 120, linetype="solid",color = "red", size=1) p ggsave(plot = p, width = 5, height = 3, dpi = 300, filename = "rdd10.pdf") # Kernels #### x <- seq(-2,2,by=0.01) k1 <- dnorm(x) k2 <- pmax(0*x+1 - 4*(x< -1) - 4*(x>1),0) k3 <- pmax(1-abs(x),0) k4 <- pmax(0.75*(1-x^2),0) df <- data.frame(x,k1,k2,k3,k4) p1 <- ggplot() + geom_line(data=df,aes(x=x,y=k1))+ylim(0,1)+ggtitle("Gaussian")+theme(axis.title.y=element_blank()) p2 <- ggplot() + geom_line(data=df,aes(x=x,y=k2))+ylim(0,1)+ggtitle("Uniform")+theme(axis.title.y=element_blank()) p3 <- ggplot() + geom_line(data=df,aes(x=x,y=k3))+ggtitle("Triangular")+theme(axis.title.y=element_blank()) p4 <- ggplot() + geom_line(data=df,aes(x=x,y=k4))+ggtitle("Epanechnikov")+theme(axis.title.y=element_blank()) plot <- ggarrange(p1,p2,p3,p4,ncol = 4, nrow = 1) annotate_figure(plot, top = text_grob("Different kernels", face = "bold", size = 14)) ggsave(plot = plot, width = 8, height = 2, dpi = 300, filename = "kernels.pdf") # DiD - demonstration, Simple 2x2 ##### # https://lost-stats.github.io/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html # Download the Excel file from a URL tf <- tempfile(fileext = ".xlsx") GET( "https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Two_by_Two_Difference_in_Difference/did_crime.xlsx", write_disk(tf) ) DiD <- read_excel(tf) #Data are collapsed already. DiD <- DiD %>% mutate(after = year >= 2014) %>% mutate(treatafter = after*treat) mt <- ggplot(DiD,aes(x=year, y=murder, color = as.factor(treat))) + geom_point(size=3)+geom_line() + geom_vline(xintercept=2014,lty=4) + labs(title="Murder and Time", x="Year", y="Murder Rate") mt DiD$D <- as.factor(DiD$year - 2014) summary(lm(murder ~ D, data=DiD)) # DiD 2x2 reg<-lm(murder ~ treat+after+treatafter, data = DiD) summary(reg) #look at what the different coefficients mean y11 <- mean(DiD$murder[DiD$treat==1 & DiD$after==1]) y10 <- mean(DiD$murder[DiD$treat==1 & DiD$after==0]) y01 <- mean(DiD$murder[DiD$treat==0 & DiD$after==1]) y00 <- mean(DiD$murder[DiD$treat==0 & DiD$after==0]) reg$coefficients[1] y00 reg$coefficients[2] y10 - y00 reg$coefficients[3] y01 - y00 reg$coefficients[4] #this is the DiD estimator (y11 - y10) - (y01 - y00) # DiD - demonstration, Simple 2x2 (different library) ##### # Martin Huber's book (an early draft) #https://drive.switch.ch/index.php/s/tNhKQmkGB48bjfz # page 229 # load kielmc data data(kielmc) attach(kielmc) library(multiwayvcov) #define variables #real price of house measured in 1978 USD Y <- rprice # house is close (D=1) of further away (D=0) from garbage incinerator D <- nearinc T <- y81 interact=D*T # DiD regression did=lm(Y~D+T+interact ) # cluster: distance to center (cbd) vcovCL=cluster.vcov (model=did , cluster=cbd) # results with cluster st.error coeftest(did, vcov=vcovCL) # DiD - demonstration, Simple 2x2 with covariates (IPW based) ##### # Martin Huber's book (an early draft) #https://drive.switch.ch/index.php/s/tNhKQmkGB48bjfz # page 235 library(causalweight) X <- cbind(area, rooms, baths) set.seed(1) out <- didweight(y=Y,d=D,t=T,x=X,boot=399,cluster=cbd) out$effect ; out$se; out$pvalue # DiD - with variation in the treatment time using "did" library ##### #based on Callaway, Brantly, and Pedro HC Sant’Anna. "Difference-in-differences with multiple time periods." # Journal of Econometrics 225.2 (2021): 200-230. # https://bcallaway11.github.io/did/ # https://bcallaway11.github.io/did/articles/did-basics.html data(mpdta) out <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", xformla = ~1, data = mpdta, est_method = "reg" ) summary(out) ggdid(out, ylim = c(-.25,.1)) es <- aggte(out, type = "dynamic") ggdid(es) group_effects <- aggte(out, type = "group") summary(group_effects) # DiD - event study plot ##### # https://lost-stats.github.io/Model_Estimation/Research_Design/event_study.html # # Load and prepare data dat = fread("https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Event_Study_DiD/bacon_example.csv") # Let's create a more user-friendly indicator of which states received treatment dat[, treat := ifelse(is.na(`_nfd`), 0, 1)] # Create a "time_to_treatment" variable for each state, so that treatment is # relative for all treated units. For the never-treated (i.e. control) units, # we'll arbitrarily set the "time_to_treatment" value at 0. This value # doesn't really matter, since it will be canceled by the treat==0 interaction # anyway. But we do want to make sure they aren't NA, otherwise feols would drop # these never-treated observations at estimation time and our results will be # off. dat[, time_to_treat := ifelse(treat==1, year - `_nfd`, 0)] mod_twfe = feols(asmrs ~ i(time_to_treat, treat, ref = -1) + ## Our key interaction: time × treatment status pcinc + asmrh + cases | ## Other controls stfips + year, ## FEs cluster = ~stfips, ## Clustered SEs data = dat) iplot(mod_twfe, xlab = 'Time to treatment', main = 'Event study: Staggered treatment (TWFE)') # DiD - doubly robust estimator using "DRDID" library ##### # doubly robust estimator based on # Sant’Anna, Pedro H. C., and Zhao, Jun (2020), “Doubly Robust Difference-in-Differences Estimators”, Journal of Econometrics, Vol. 219 (1), pp. 101-122. # https://pedrohcgs.github.io/DRDID/ # Panel data case # Form the Lalonde sample with CPS comparison group eval_lalonde_cps <- subset(nsw_long, nsw_long$treated == 0 | nsw_long$sample == 2) # Further reduce sample to speed example set.seed(123) unit_random <- sample(unique(eval_lalonde_cps$id), 5000) eval_lalonde_cps <- eval_lalonde_cps[eval_lalonde_cps$id %in% unit_random,] # Implement improved DR locally efficient DID with panel data drdid(yname="re", tname = "year", idname = "id", dname = "experimental", xformla= ~ age+ educ+ black+ married+ nodegree+ hisp+ re74, data = eval_lalonde_cps, panel = TRUE) #Implement "traditional" DR locally efficient DID with panel data drdid(yname="re", tname = "year", idname = "id", dname = "experimental", xformla= ~ age+ educ+ black+ married+ nodegree+ hisp+ re74, data = eval_lalonde_cps, panel = TRUE, estMethod = "trad")