# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, devtools, ggpubr, tidysynth, ISLR2,
               glmnet, reshape2, RColorBrewer, plot3D, dplyr, parallel,
               randomForestSRC, ggRandomForests, DoubleML, mlr3, mlr3learners)

setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# SCM #####

# This illustration is from:
# https://github.com/edunford/tidysynth
# California tobacco example
# it beautifies the synth package
# Abadie, Alberto, Alexis Diamond, and Jens Hainmueller.
# "Synthetic control methods for comparative case studies: Estimating the
# effect of California's tobacco control program." Journal of the
# American Statistical Association 105.490 (2010): 493-505.

require(tidysynth)
data("smoking")
smoking %>% dplyr::glimpse()

smoking_out <- smoking %>%

  # initialize the synthetic control object
  synthetic_control(outcome = cigsale,        # outcome
                    unit = state,             # unit index in the panel data
                    time = year,              # time index in the panel data
                    i_unit = "California",    # unit where the intervention occurred
                    i_time = 1988,            # time period when the intervention occurred
                    generate_placebos = TRUE  # generate placebo synthetic controls (for inference)
                    ) %>%

  # Generate the aggregate predictors used to fit the weights:
  # average log income, retail price of cigarettes, and proportion of the
  # population between 15 and 24 years of age from 1980 - 1988
  generate_predictor(time_window = 1980:1988,
                     ln_income = mean(lnincome, na.rm = TRUE),
                     ret_price = mean(retprice, na.rm = TRUE),
                     youth = mean(age15to24, na.rm = TRUE)) %>%

  # average beer consumption in the donor pool from 1984 - 1988
  generate_predictor(time_window = 1984:1988,
                     beer_sales = mean(beer, na.rm = TRUE)) %>%

  # Lagged cigarette sales
  generate_predictor(time_window = 1975, cigsale_1975 = cigsale) %>%
  generate_predictor(time_window = 1980, cigsale_1980 = cigsale) %>%
  generate_predictor(time_window = 1988, cigsale_1988 = cigsale) %>%

  # Generate the fitted weights for the synthetic control
  generate_weights(optimization_window = 1970:1988, # time to use in the optimization task
                   margin_ipop = 5e-04, sigf_ipop = 7, bound_ipop = 6 # optimizer options
                   # margin_ipop = .02, sigf_ipop = 7, bound_ipop = 6 # optimizer options
                   ) %>%

  # Generate the synthetic control
  generate_control()

# output of the analysis
smoking_out %>% plot_trends()
smoking_out %>% plot_differences()
smoking_out %>% grab_balance_table()
smoking_out %>% plot_placebos()
smoking_out %>% plot_placebos(prune = FALSE)
smoking_out %>% plot_mspe_ratio()
smoking_out %>% grab_signficance() # (sic - this is how the function is named in tidysynth)
smoking_out %>% grab_unit_weights()
smoking_out %>% grab_predictor_weights()

# Regularization #####

# Lab: Linear Models and Regularization Methods
# taken from https://www.statlearning.com/resources-second-edition
# [comments my own]

### Choosing Among Models Using the Validation-Set Approach and Cross-Validation
## Ridge Regression and the Lasso

###
# drop players with a missing salary, then create quantitative inputs
Hitters <- na.omit(Hitters)
x <- model.matrix(Salary ~ ., Hitters)[, -1] # matrix x is without the intercept
y <- Hitters$Salary

### Ridge Regression ###

# create a grid for the regularization parameter
grid <- 10^seq(10, -2, length = 100)
# run ridge regression
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)
###
# we have a set of coefficients for every value of lambda in the grid
dim(coef(ridge.mod))
# look at the 50th value
ridge.mod$lambda[50]
coef(ridge.mod)[, 50]
sqrt(sum(coef(ridge.mod)[-1, 50]^2))
# look at the 60th value
ridge.mod$lambda[60]
coef(ridge.mod)[, 60]
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
# a smaller penalty leads to a larger coefficient norm
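# A quick look at shrinkage across the whole grid. This block is my addition
# (not part of the ISLR2 lab): it traces the l2 norm of the ridge coefficients
# (intercept excluded) over the full lambda grid, generalizing the two spot
# checks above. It only uses the ridge.mod object defined above.
l2_norms <- sqrt(colSums(as.matrix(coef(ridge.mod))[-1, ]^2))
plot(log10(ridge.mod$lambda), l2_norms, type = "l",
     xlab = "log10(lambda)", ylab = "l2 norm of the coefficient vector")
# the norm shrinks towards zero as the penalty grows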
# coefficients for a penalty of lambda = 50
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]

set.seed(1)
# take a random half of the data and label it as train data
train <- sample(1:nrow(x), nrow(x) / 2)
test <- (-train)
y.test <- y[test]
###
# estimate the model on the train data
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
# use the estimated model for prediction on the test data (lambda = 4)
ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
# test data error (out-of-sample error)
mean((ridge.pred - y.test)^2)
# test error of the intercept-only benchmark: predict every test observation
# with the mean of the training outcomes
mean((mean(y[train]) - y.test)^2)
###
# now use a much larger penalty lambda
ridge.pred <- predict(ridge.mod, s = 1e10, newx = x[test, ])
mean((ridge.pred - y.test)^2)
###
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ],
                      exact = TRUE, x = x[train, ], y = y[train])
# the test error is worse than with lambda = 4
mean((ridge.pred - y.test)^2)
# ridge with lambda = 0 is the same as OLS
lm(y ~ x, subset = train)
predict(ridge.mod, s = 0, exact = TRUE, type = "coefficients",
        x = x[train, ], y = y[train])[1:20, ]

set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
# what is the best penalty?
bestlam <- cv.out$lambda.min
bestlam
###
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
mean((ridge.pred - y.test)^2)
# this is the best test error we can get with ridge
###
out <- glmnet(x, y, alpha = 0)
# look at the coefficients
predict(out, type = "coefficients", s = bestlam)[1:20, ]
# none is zero

### The Lasso ###

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
# cross-validation
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2) # this is the test error
# lasso automatically sets some coefficients to zero
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:20, ]
lasso.coef
lasso.coef[lasso.coef != 0]

# Regression Tree and Random Forest #####

# Based on https://arxiv.org/pdf/1501.07196.pdf
# Modified

theme_set(theme_bw()) # a ggplot2 theme with a white background

# Load the Boston Housing data
data(Boston, package = "MASS")
# Check the variables
?Boston
# Set modes correctly. For binary variables: transform to logical
Boston$chas <- as.logical(Boston$chas)

# Use reshape2::melt to transform the data into long format.
dta <- melt(Boston, id.vars = c("medv", "chas"))
# plot panels for each covariate colored by the logical chas variable
ggplot(dta, aes(x = medv, y = value, color = chas)) +
  geom_point(alpha = .4) +
  geom_rug(data = dta %>% filter(is.na(value))) +
  labs(y = "", x = "medv") +
  scale_color_brewer(palette = "Set2") +
  facet_wrap(~variable, scales = "free_y", ncol = 3)

# grow the random forest
rfsrc_Boston <- rfsrc(medv ~ ., importance = "TRUE", block.size = 1, data = Boston)
# print the forest summary
rfsrc_Boston
# plot the out-of-bag error rate and variable importance
plot(rfsrc_Boston)
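# Peek at the numbers behind the importance plot above. This snippet is my
# addition: it assumes the fitted rfsrc object stores its (permutation)
# importance values in the $importance component, which holds when importance
# is requested in the rfsrc() call.
sort(rfsrc_Boston$importance, decreasing = TRUE)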
# run variable selection via minimal depth
varsel_Boston <- var.select(rfsrc_Boston)
# Save the gg_minimal_depth object for later use.
gg_md <- gg_minimal_depth(varsel_Boston)
# plot the object
plot(gg_md) #, lbls=st.labs)

# Partial effects
partial_Boston <- plot.variable(rfsrc_Boston,
                                xvar = gg_md$topvars,
                                partial = TRUE, sorted = FALSE,
                                show.plots = TRUE)
# lstat: lower socio-economic status of the population (in %)
partial_Boston <- plot.variable(rfsrc_Boston,
                                xvar = "lstat",
                                partial = TRUE, sorted = FALSE,
                                show.plots = TRUE)

# Double ML #####

# neat and clean library for performing DML
# https://arxiv.org/pdf/2103.09603.pdf

alpha = 0.5
n_obs = 500
n_vars = 20
set.seed(1234)
# create a dataset for the partially linear regression demonstration
data_plr = make_plr_CCDDHNR2018(alpha = alpha, n_obs = n_obs, dim_x = n_vars,
                                return_type = "data.table")
# create the data object that this library works with
obj_dml_data = DoubleMLData$new(data_plr, y_col = "y", d_cols = "d")
# reduce mlr3 logging to warnings only
lgr::get_logger("mlr3")$set_threshold("warn")
# Initialize random forest learners for the nuisance functions
ml_g = lrn("regr.ranger", num.trees = 100, mtry = n_vars, min.node.size = 2, max.depth = 5)
ml_m = lrn("regr.ranger", num.trees = 100, mtry = n_vars, min.node.size = 2, max.depth = 5)
# use two-fold cross-fitting
doubleml_plr = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, n_folds = 2, score = "IV-type")
doubleml_plr$fit()
# ok, we do recover the true parameter 0.5
doubleml_plr$summary()
# (a short sketch of extracting a confidence interval is appended at the end of the script)

# A bonus #####

library(datasauRus)
library(ggplot2)
library(dplyr)

datasaurus_dozen %>%
  group_by(dataset) %>%
  summarize(
    mean_x = mean(x),
    mean_y = mean(y),
    std_dev_x = sd(x),
    std_dev_y = sd(y),
    corr_x_y = cor(x, y))

ggplot(datasaurus_dozen, aes(x = x, y = y, colour = dataset)) +
  geom_point() +
  geom_smooth(method = "lm", formula = "y~x") +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~dataset, ncol = 3)

# Lesson to take:
# Same summary stats, same regression line... yet very different data.
# Always plot your data.
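# Appendix: Double ML confidence interval #####
# (referenced from the Double ML section above) A minimal sketch, my addition:
# assuming the fitted DoubleML object exposes a confint() method, report a
# confidence interval for the treatment coefficient d alongside the point
# estimate printed by summary().
doubleml_plr$confint()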