# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, devtools, ggpubr, tidysynth, ISLR2,
               glmnet, reshape2, RColorBrewer, plot3D, dplyr, parallel,
               randomForestSRC, ggRandomForests, DoubleML, mlr3, mlr3learners)

setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# SCM #####

# This illustration is from:
# https://github.com/edunford/tidysynth
# California tobacco example
# it beautifies the synth package
# Abadie, Alberto, Alexis Diamond, and Jens Hainmueller.
# "Synthetic control methods for comparative case studies: Estimating the
# effect of California's tobacco control program." Journal of the
# American Statistical Association 105.490 (2010): 493-505.

require(tidysynth)
data("smoking")
smoking %>% dplyr::glimpse()

smoking_out <- smoking %>%

  # initialize the synthetic control object
  synthetic_control(outcome = cigsale,        # outcome
                    unit = state,             # unit index in the panel data
                    time = year,              # time index in the panel data
                    i_unit = "California",    # unit where the intervention occurred
                    i_time = 1988,            # time period when the intervention occurred
                    generate_placebos = TRUE  # generate placebo synthetic controls (for inference)
                    ) %>%

  # Generate the aggregate predictors used to fit the weights:
  # average log income, retail price of cigarettes, and proportion of the
  # population between 15 and 24 years of age from 1980 - 1988
  generate_predictor(time_window = 1980:1988,
                     ln_income = mean(lnincome, na.rm = TRUE),
                     ret_price = mean(retprice, na.rm = TRUE),
                     youth = mean(age15to24, na.rm = TRUE)) %>%

  # average beer consumption in the donor pool from 1984 - 1988
  generate_predictor(time_window = 1984:1988,
                     beer_sales = mean(beer, na.rm = TRUE)) %>%

  # Lagged cigarette sales
  generate_predictor(time_window = 1975, cigsale_1975 = cigsale) %>%
  generate_predictor(time_window = 1980, cigsale_1980 = cigsale) %>%
  generate_predictor(time_window = 1988, cigsale_1988 = cigsale) %>%

  # Generate the fitted weights for the synthetic control
  generate_weights(optimization_window = 1970:1988, # time to use in the optimization task
                   margin_ipop = 5e-04, sigf_ipop = 7, bound_ipop = 6 # optimizer options
                   # margin_ipop = .02, sigf_ipop = 7, bound_ipop = 6 # optimizer options
                   ) %>%

  # Generate the synthetic control
  generate_control()

# output of the analysis
smoking_out %>% plot_trends()
smoking_out %>% plot_differences()
smoking_out %>% grab_balance_table()
smoking_out %>% plot_placebos()
smoking_out %>% plot_placebos(prune = FALSE)
smoking_out %>% plot_mspe_ratio()
smoking_out %>% grab_signficance() # (sic - this is how the function is named in tidysynth)
smoking_out %>% grab_unit_weights()
smoking_out %>% grab_predictor_weights()

# Regularization #####

# Lab: Linear Models and Regularization Methods
# taken from https://www.statlearning.com/resources-second-edition
# [comments my own]

### Choosing Among Models Using the Validation-Set Approach and Cross-Validation
## Ridge Regression and the Lasso

###
# drop players with a missing salary, then create quantitative inputs
Hitters <- na.omit(Hitters)
x <- model.matrix(Salary ~ ., Hitters)[, -1] # matrix x is without the intercept
y <- Hitters$Salary

### Ridge Regression ###

# create a grid for the regularization parameter
grid <- 10^seq(10, -2, length = 100)
# run ridge regression
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)
###
# we have a set of coefficients for every value of lambda in the grid
dim(coef(ridge.mod))
# look at the 50th value
ridge.mod$lambda[50]
coef(ridge.mod)[, 50]
sqrt(sum(coef(ridge.mod)[-1, 50]^2))
# look at the 60th value
ridge.mod$lambda[60]
coef(ridge.mod)[, 60]
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
# a smaller penalty leads to a larger coefficient norm
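# A quick look at shrinkage across the whole grid. This block is my addition
# (not part of the ISLR2 lab): it traces the l2 norm of the ridge coefficients
# (intercept excluded) over the full lambda grid, generalizing the two spot
# checks above. It only uses the ridge.mod object defined above.
l2_norms <- sqrt(colSums(as.matrix(coef(ridge.mod))[-1, ]^2))
plot(log10(ridge.mod$lambda), l2_norms, type = "l",
     xlab = "log10(lambda)", ylab = "l2 norm of the coefficient vector")
# the norm shrinks towards zero as the penalty grows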
# coefficients for a penalty of lambda = 50
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]

set.seed(1)
# take a random half of the data and label it as train data
train <- sample(1:nrow(x), nrow(x) / 2)
test <- (-train)
y.test <- y[test]
###
# estimate the model on the train data
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
# use the estimated model for prediction on the test data (lambda = 4)
ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
# test data error (out-of-sample error)
mean((ridge.pred - y.test)^2)
# test error of the intercept-only benchmark: predict every test observation
# with the mean of the training outcomes
mean((mean(y[train]) - y.test)^2)
###
# now use a much larger penalty lambda
ridge.pred <- predict(ridge.mod, s = 1e10, newx = x[test, ])
mean((ridge.pred - y.test)^2)
###
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ],
                      exact = TRUE, x = x[train, ], y = y[train])
# the test error is worse than with lambda = 4
mean((ridge.pred - y.test)^2)
# ridge with lambda = 0 is the same as OLS
lm(y ~ x, subset = train)
predict(ridge.mod, s = 0, exact = TRUE, type = "coefficients",
        x = x[train, ], y = y[train])[1:20, ]

set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
# what is the best penalty?
bestlam <- cv.out$lambda.min
bestlam
###
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
mean((ridge.pred - y.test)^2)
# this is the best test error we can get with ridge
###
out <- glmnet(x, y, alpha = 0)
# look at the coefficients
predict(out, type = "coefficients", s = bestlam)[1:20, ]
# none is zero

### The Lasso ###

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
# cross-validation
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2) # this is the test error
# lasso automatically sets some coefficients to zero
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:20, ]
lasso.coef
lasso.coef[lasso.coef != 0]

# Regression Tree and Random Forest #####

# Based on https://arxiv.org/pdf/1501.07196.pdf
# Modified

theme_set(theme_bw()) # a ggplot2 theme with a white background

# Load the Boston Housing data
data(Boston, package = "MASS")
# Check the variables
?Boston
# Set modes correctly. For binary variables: transform to logical
Boston$chas <- as.logical(Boston$chas)

# Use reshape2::melt to transform the data into long format.
dta <- melt(Boston, id.vars = c("medv", "chas"))
# plot panels for each covariate colored by the logical chas variable
ggplot(dta, aes(x = medv, y = value, color = chas)) +
  geom_point(alpha = .4) +
  geom_rug(data = dta %>% filter(is.na(value))) +
  labs(y = "", x = "medv") +
  scale_color_brewer(palette = "Set2") +
  facet_wrap(~variable, scales = "free_y", ncol = 3)

# grow the random forest
rfsrc_Boston <- rfsrc(medv ~ ., importance = "TRUE", block.size = 1, data = Boston)
# print the forest summary
rfsrc_Boston
# plot the out-of-bag error rate and variable importance
plot(rfsrc_Boston)
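# Peek at the numbers behind the importance plot above. This snippet is my
# addition: it assumes the fitted rfsrc object stores its (permutation)
# importance values in the $importance component, which holds when importance
# is requested in the rfsrc() call.
sort(rfsrc_Boston$importance, decreasing = TRUE)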
# run variable selection via minimal depth
varsel_Boston <- var.select(rfsrc_Boston)
# Save the gg_minimal_depth object for later use.
gg_md <- gg_minimal_depth(varsel_Boston)
# plot the object
plot(gg_md) #, lbls=st.labs)

# Partial effects
partial_Boston <- plot.variable(rfsrc_Boston,
                                xvar = gg_md$topvars,
                                partial = TRUE, sorted = FALSE,
                                show.plots = TRUE)
# lstat: lower socio-economic status of the population (in %)
partial_Boston <- plot.variable(rfsrc_Boston,
                                xvar = "lstat",
                                partial = TRUE, sorted = FALSE,
                                show.plots = TRUE)

# Double ML #####

# neat and clean library for performing DML
# https://arxiv.org/pdf/2103.09603.pdf

alpha = 0.5
n_obs = 500
n_vars = 20
set.seed(1234)
# create a dataset for the partially linear regression demonstration
data_plr = make_plr_CCDDHNR2018(alpha = alpha, n_obs = n_obs, dim_x = n_vars,
                                return_type = "data.table")
# create the data object that this library works with
obj_dml_data = DoubleMLData$new(data_plr, y_col = "y", d_cols = "d")
# reduce mlr3 logging to warnings only
lgr::get_logger("mlr3")$set_threshold("warn")
# Initialize random forest learners for the nuisance functions
ml_g = lrn("regr.ranger", num.trees = 100, mtry = n_vars, min.node.size = 2, max.depth = 5)
ml_m = lrn("regr.ranger", num.trees = 100, mtry = n_vars, min.node.size = 2, max.depth = 5)
# use two-fold cross-fitting
doubleml_plr = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, n_folds = 2, score = "IV-type")
doubleml_plr$fit()
# ok, we do recover the true parameter 0.5
doubleml_plr$summary()
# (a short sketch of extracting a confidence interval is appended at the end of the script)

# A bonus #####

library(datasauRus)
library(ggplot2)
library(dplyr)

datasaurus_dozen %>%
  group_by(dataset) %>%
  summarize(
    mean_x = mean(x),
    mean_y = mean(y),
    std_dev_x = sd(x),
    std_dev_y = sd(y),
    corr_x_y = cor(x, y))

ggplot(datasaurus_dozen, aes(x = x, y = y, colour = dataset)) +
  geom_point() +
  geom_smooth(method = "lm", formula = "y~x") +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~dataset, ncol = 3)

# Lesson to take:
# Same summary stats, same regression line... yet very different data.
# Always plot your data.
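# Appendix: Double ML confidence interval #####
# (referenced from the Double ML section above) A minimal sketch, my addition:
# assuming the fitted DoubleML object exposes a confint() method, report a
# confidence interval for the treatment coefficient d alongside the point
# estimate printed by summary().
doubleml_plr$confint()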