# REGRESSION BASICS

# Loading packages #####

# "pacman" is a package that makes loading easier
if (!require("pacman")) install.packages("pacman")
pacman::p_load(datasets, ggplot2, faraway, readxl, HSAUR, MASS, lmtest, sandwich, dplyr)

# Basic linear regression #####

# How does braking distance depend on the car's speed?

# Plot the relationship
plot(cars$speed, cars$dist)

# Try to get familiar with the ggplot2 library.
# Its ability to produce beautiful graphs is unmatched.
# This is the one book you want to look into: https://ggplot2-book.org
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point()

# Fit a simple straight line through the data
lmod <- lm(dist ~ speed, data=cars)
summary(lmod)

# There are many different lines that we could fit
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_abline(slope=3, intercept=10, col="grey") +
  geom_abline(slope=5, intercept=-30, col="grey") +
  geom_abline(slope=8, intercept=-80, col="grey") +
  geom_abline(slope=lmod$coeff[2], intercept=lmod$coeff[1], col="blue")

# or make use of the built-in options in ggplot2
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x, se=FALSE)

# Returns to education #####
#
# Data downloaded from https://davidcard.berkeley.edu/data_sets.html
# from the paper:
# "Using Geographic Variation in College Proximity to Estimate the Return to Schooling".
# In L.N. Christofides, E.K. Grant, and R. Swidinsky, editors, Aspects of Labor Market
# Behaviour: Essays in Honour of John Vanderkamp. Toronto: University of Toronto Press, 1995.
# Data also here: https://docs.google.com/spreadsheets/d/116KwP9SJUQ8LZHeIH-sTvlFFwCgkKIsz86exk1I7O1s/edit#gid=943363882
# save as card.xlsx
# Inspired by Problem 4 from https://sites.google.com/view/microeconometricswithr/problems/problems-ch1

# adjust the path to your own machine
setwd("/Users/lukaslaffers/Dropbox/econx phd/code")

# this uses the readxl package
data_card <- read_xlsx("card.xlsx")

lwage <- as.numeric(data_card$lwage76)   # log wage (1976)
educ  <- as.numeric(data_card$ed76)      # years of education (1976)
age   <- as.numeric(data_card$age76)     # age (1976)
south <- as.numeric(data_card$south66)   # lived in the south (1966)

# define experience
expi <- age - educ + 6

# create a data frame
df <- data.frame(lwage, educ, age, expi, south)

# plot the data
plot(educ, lwage)
summary(lm(lwage ~ educ))

# plot it together with linear and quadratic fits
ggplot(aes(x=educ, y=lwage), data=df) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), col="red")

# adding age does not change our parameter of interest much
summary(lm(lwage ~ educ + age))

# plot lwage against experience
plot(expi, lwage)
summary(lm(lwage ~ expi))  # not significant

# visual inspection confirms this
ggplot(aes(x=expi, y=lwage), data=df) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), col="red")

# why is the coefficient of educ so wildly different?
summary(lm(lwage ~ educ + expi))

# they are highly correlated
cor(educ, expi)

# and what is going on here??
# (hint: expi = age - educ + 6 by construction, so the three regressors
#  are exactly collinear and lm() reports NA for one of them)
summary(lm(lwage ~ educ + age + expi))

# Interaction terms #####

# ...previous example continued
ggplot(aes(x=educ, y=lwage), data=df) +
  geom_point(aes(colour=factor(south)))

# you may want to add some noise in order to avoid overplotting
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_jitter(width = 0.5, height = 0)
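# An alternative to jitter -- a minimal sketch, not part of the original script:
# semi-transparent points also reduce overplotting without moving the data.
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_point(alpha = 0.2)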
# is the association between education and wages different in the south
# than in the north?
ggplot(aes(x=educ, y=lwage, colour=factor(south)), data=df) +
  geom_jitter(width = 0.5, height = 0) +
  geom_smooth(method = "lm", formula = y ~ x)

# overall
lm(lwage ~ educ, data=df)$coeff[2]                         # 5.21%
# south only
lm(lwage ~ educ, data=df %>% filter(south==1))$coeff[2]    # 5.69%
# north only
lm(lwage ~ educ, data=df %>% filter(south==0))$coeff[2]    # 3.18%

# regression with interaction
lm(lwage ~ educ*south, data=df)
# 3.19 + 2.50 = 5.69
# the interaction regression is the same as running the two regressions separately

# how about an education x age interaction?
# is the association between wages and education different for old/young workers?
ggplot(aes(x=educ, y=lwage, colour=factor(age>median(age))), data=df) +
  geom_jitter(width = 0.5, height = 0) +
  geom_smooth(method = "lm", formula = y ~ x)
# the intercept does not appear to be very different

summary(lm(lwage ~ educ*I(age>28), data=df))
# and this is confirmed by this regression:
# the interaction term is 2.4%, so the association between educ and lwage
# is stronger for older workers (as seen in the figure)

# by dichotomizing age into two categories we lose information
# let's make use of the full information here
summary(lm(lwage ~ educ*age, data=df))
# this is somewhat difficult to interpret
# there isn't a person who is 0 years old
mean(df$age)

df$agec <- age - 28
summary(lm(lwage ~ educ*agec, data=df))
# here the education coefficient is the association for someone who is 28 years old
# the interaction term stays the same, R2 stays the same (do you see why?)
# how about the age coefficient?

df$educc <- df$educ - 12
summary(lm(lwage ~ educc*agec, data=df))
# now we can interpret the age coefficient.
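# A quick check of the earlier claim -- a minimal sketch, not in the original
# script (m_raw and m_cen are just illustrative names): recentering age is a
# linear reparameterization, so fitted values, R^2 and the interaction
# coefficient are unchanged.
m_raw <- lm(lwage ~ educ*age,  data=df)
m_cen <- lm(lwage ~ educ*agec, data=df)
all.equal(fitted(m_raw), fitted(m_cen))
c(summary(m_raw)$r.squared, summary(m_cen)$r.squared)
c(coef(m_raw)["educ:age"], coef(m_cen)["educ:agec"])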
# Pearl's Simpson machine #####

# Source: http://dagitty.net/learn/simpson/index.html
# Adding variables changes the model's structure.
# Consecutively adding variables may change parameter signs at every step!!!

set.seed(89374)

simpson.simulator <- function(N, s, ce){
  Z1 <- rnorm(N,0,s)
  Z3 <- rnorm(N,0,s) + Z1
  U  <- rnorm(N,0,s) + Z1
  Z2 <- rnorm(N,0,s) + Z3 + U
  X  <- rnorm(N,0,s) + U
  Y  <- rnorm(N,0,s) + ce*X + 10*Z3
  data.frame(Y,X,Z1,Z2,Z3)
}
# 1st parameter: sample size
# 2nd parameter: noise standard deviation
# 3rd parameter: true causal effect

D <- simpson.simulator(1000,0.01,1)
# true effect is 1

# unadjusted estimate
m <- lm(D[,1:2])
summary(m)
confint(m,'X')

# adjusted for {Z1}
m <- lm(D[,c(1,2,3)])
summary(m)
confint(m,'X')
# coeff is about 1

# adjusted for {Z1,Z2}
m <- lm(D[,c(1,2,3,4)])
summary(m)
confint(m,'X')
# coeff is about -1
# (Z2 is a collider on the path X <- U -> Z2 <- Z3 -> Y;
#  adjusting for Z2 opens that path and here even flips the sign)

# adjusted for {Z1,Z2,Z3}
m <- lm(D[,c(1,2,3,4,5)])
summary(m)
confint(m,'X')
# coeff is about 1

# Heteroskedasticity robust standard errors #####

# generate a random dataset
set.seed(930)
n <- 1000
x1 <- rnorm(n)
y <- 1 + 3*x1 + x1*rnorm(n)

lmod <- lm(y ~ x1)
summary(lmod)

# look at the residual plot, there clearly is a pattern
plot(lmod,1)

# Here we show how the HR standard errors can be calculated

# get the estimated coefficients
hatbeta <- as.numeric(lmod$coeff)
# matrix X
x <- cbind(rep(1,n),x1)
# residuals, hat epsilon
e <- y - x %*% hatbeta
# X^T X
xx <- t(x)%*%x
# (X^T X)^{-1}
xxi <- solve(xx)
# (X^T X)^{-1} X^T y is the same as hatbeta
xxi %*% t(x) %*% y

# calculate the leverage
leverage <- 1 - diag(diag(n) - x %*% xxi %*% t(x))

# calculation of the different standard errors
# (source: adapted from Hansen, Econometrics, p. 144)
p <- ncol(x)
a <- n/(n-p)
sig2 <- as.numeric( (t(e) %*% e)/(n-p) )

u1 <- x*(e%*%matrix(1,1,p))
u2 <- x*((e/sqrt(1-leverage))%*%matrix(1,1,p))
u3 <- x*((e/(1-leverage))%*%matrix(1,1,p))

v0  <- xxi * sig2
v1  <- xxi %*% (t(u1)%*%u1) %*% xxi
v1a <- a * xxi %*% (t(u1)%*%u1) %*% xxi
v2  <- xxi %*% (t(u2)%*%u2) %*% xxi
v3  <- xxi %*% (t(u3)%*%u3) %*% xxi

# Homoskedastic formula
( s0 <- sqrt(diag(v0)) )
# HC0
( s1 <- sqrt(diag(v1)) )
# HC1
( s1a <- sqrt(diag(v1a)) )
# HC2
( s2 <- sqrt(diag(v2)) )
# HC3
( s3 <- sqrt(diag(v3)) )

# A simpler way is to use coeftest from the lmtest library

# Homoskedastic formula
coeftest(lmod)
# HC0
coeftest(lmod, vcov = sandwich)
coeftest(lmod, vcov = vcovHC(lmod, "HC0"))
# HC1 (the default in Stata: "reg y x, robust")
coeftest(lmod, vcov = vcovHC(lmod, "HC1"))
# HC2
coeftest(lmod, vcov = vcovHC(lmod, "HC2"))
# HC3
coeftest(lmod, vcov = vcovHC(lmod, "HC3"))

# Diagnostics #####

# correct model
x <- runif(100, 0, 10)
y <- 1 + 2 * x + rnorm(100, 0, 1)
m <- lm(y ~ x)
par(mfrow = c(2, 2))
plot(m)

# a misspecified model: the true relationship is cubic, but we fit a line
y <- 1 + 2 * x + 1 * x^2 - 0.5 * x^3
m <- lm(y ~ x)
par(mfrow = c(2, 2))
plot(m)

# Model Building #####
#
# Some high-school physics first:
#
# [kinetic energy] = 0.5 * weight * speed^2
# [work done by braking] = mu * weight * g * distance
#   mu - coefficient of friction
#   g  - standard acceleration of free fall
#
# [kinetic energy] = [work done by braking]
# leads to
# distance = (1/(2*mu*g)) * speed^2 = c * speed^2

lmod2 <- lm(dist ~ speed + I(speed^2), data=cars)
summary(lmod2)

lmod3 <- lm(dist ~ I(speed^2), data=cars)
summary(lmod3)

lmod4 <- lm(dist ~ I(speed^2) - 1, data=cars)
summary(lmod4)
# Be careful, removing the intercept changes the interpretation of R^2

ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x, se=FALSE) +
  geom_smooth(method='lm', formula= y~x + I(x^2), se=FALSE, col="red") +
  geom_smooth(method='lm', formula= y~I(x^2), se=FALSE, col="green") +
  geom_smooth(method='lm', formula= y~I(x^2) - 1, se=FALSE, col="black")
# Which one would you pick??
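# A sketch of one way to compare these candidate models on the same response
# (AIC; lower is better). This is not in the original script; note that lmod
# from the first section was overwritten in the heteroskedasticity example,
# so we refit the straight line here under the illustrative name lmod1.
lmod1 <- lm(dist ~ speed, data=cars)
AIC(lmod1, lmod2, lmod3, lmod4)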
lmod5 <- lm(log(dist) ~ log(speed), data=cars)
summary(lmod5)
# the estimated exponent is about 1.6, though, not the 2 suggested by the physics

# Fit is certainly not the only criterion to pick a model
ggplot(data=cars, aes(x=speed, y=dist)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7), se=FALSE)
# do you see why?

# Model Selection #####

# Freedman, David A. "A note on screening regression equations."
# The American Statistician 37.2 (1983): 152-155.

# set the random seed, so that the results are reproducible
set.seed(32413)
# try different seeds, you may get luckier
#set.seed(32414)

# create 51 columns of noise (iid N(0,1))
DAT <- matrix(rnorm(100*51), nrow=100, ncol=51)
# save them into y and X
y <- DAT[,1]
X <- DAT[,-1]

# regress noise on noise (no intercept)
summary(lm(y ~ X - 1))
# store the summary table
sum_table <- summary(lm(y ~ X - 1))

# how many variables have p-values < 0.25?
sum(sum_table$coeff[,4]<0.25)
# how many variables have p-values < 0.05?
sum(sum_table$coeff[,4]<0.05)

# now pick only those variables that had p < 0.25
X2 <- X[,which(sum_table$coeff[,4]<0.25)]
# again, run a linear regression
summary(lm(y ~ X2 - 1))

# Remember, we attempted to explain noise by noise!
# Those numbers may indeed look quite convincing!
# Lesson to take: it is risky to rely on automatic
# model selection techniques. Model building requires skill
# and subject matter expertise.

# Sample size #####

set.seed(54634)

nObs <- 10000000
x <- rnorm(nObs)
eps <- rnorm(nObs)
y <- 1 + 0.001*x + eps
summary(lm(y ~ x))

nSim <- 1000
resFew  <- rep(NA,nSim)
resMid  <- rep(NA,nSim)
resMany <- rep(NA,nSim)

nObsFew  <- 10000
nObsMid  <- 5000
nObsMany <- 100000

for (iSim in 1:nSim){
  print(iSim)

  x <- rnorm(nObsFew)
  eps <- rnorm(nObsFew)
  y <- 1 + 0.01*x + eps
  resFew[iSim] <- lm(y ~ x)$coeff[2]

  x <- rnorm(nObsMid)
  eps <- rnorm(nObsMid)
  y <- 1 + 0.01*x + eps
  resMid[iSim] <- lm(y ~ x)$coeff[2]

  x <- rnorm(nObsMany)
  eps <- rnorm(nObsMany)
  y <- 1 + 0.01*x + eps
  resMany[iSim] <- lm(y ~ x)$coeff[2]
}

DF <- data.frame(simNumber = c(1:nSim,1:nSim,1:nSim),
                 coeff = c(resMany,resMid,resFew),
                 yy = c(rep("Many",nSim),rep("Mid",nSim),rep("Few",nSim)))

ggplot(DF, aes(x=coeff)) +
  geom_histogram(data=subset(DF,yy == 'Many'), fill = "blue", alpha = 0.2) +
  #geom_histogram(data=subset(DF,yy == 'Mid'), fill = "green", alpha = 0.2) +
  geom_histogram(data=subset(DF,yy == 'Few'),  fill = "red",  alpha = 0.2)

ggplot(DF, aes(x=coeff)) +
  geom_density(data=subset(DF,yy == 'Many'), fill = "blue", alpha = 0.2) +
  #geom_density(data=subset(DF,yy == 'Mid'), fill = "green", alpha = 0.2) +
  geom_density(data=subset(DF,yy == 'Few'),  fill = "red",  alpha = 0.2)
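# The spread of the estimates above illustrates the root-n rate: with x and
# eps both standard normal, sd of the slope estimate is approximately
# 1/sqrt(nObs). A quick check -- a minimal sketch, not in the original script:
c(empirical = sd(resFew),  theoretical = 1/sqrt(nObsFew))
c(empirical = sd(resMany), theoretical = 1/sqrt(nObsMany))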
# Principal Components Analysis #####

# Example - Heptathlon
data("heptathlon", package = "HSAUR")

# scoring system - source: https://en.wikipedia.org/wiki/Heptathlon
#                  hurdles highjump shot run200m longjump javelin run800m
coeffs <- matrix(c(9.23076,  26.7, 1.835,   # hurdles
                   1.84523,  75,   1.348,   # high jump
                   56.0211,  1.5,  1.05,    # shot put
                   4.99087,  42.5, 1.81,    # run 200m
                   0.188807, 210,  1.41,    # long jump
                   15.9803,  3.8,  1.04,    # javelin
                   0.11193,  254,  1.88     # run 800m
                   ), 3, 7, byrow=FALSE)

P1 <- coeffs[1,1]*(coeffs[2,1] - heptathlon$hurdles)^coeffs[3,1]      # hurdles
P4 <- coeffs[1,4]*(coeffs[2,4] - heptathlon$run200m)^coeffs[3,4]      # run 200m
P7 <- coeffs[1,7]*(coeffs[2,7] - heptathlon$run800m)^coeffs[3,7]      # run 800m
P2 <- coeffs[1,2]*(heptathlon$highjump*100 - coeffs[2,2])^coeffs[3,2] # high jump
P5 <- coeffs[1,5]*(heptathlon$longjump*100 - coeffs[2,5])^coeffs[3,5] # long jump
P3 <- coeffs[1,3]*(heptathlon$shot - coeffs[2,3])^coeffs[3,3]         # shot put
P6 <- coeffs[1,6]*(heptathlon$javelin - coeffs[2,6])^coeffs[3,6]      # javelin

P <- P1 + P2 + P3 + P4 + P5 + P6 + P7
round(P)
heptathlon$score
# not quite the same, but perhaps the scoring tables have changed since 1988?

# recode the variables so that more is better
heptathlon$hurdles <- max(heptathlon$hurdles) - heptathlon$hurdles
heptathlon$run200m <- max(heptathlon$run200m) - heptathlon$run200m
heptathlon$run800m <- max(heptathlon$run800m) - heptathlon$run800m

# look at the correlation matrix
score_index <- which(colnames(heptathlon) == "score")
round(cor(heptathlon[,-score_index]), 2)

# perform PCA
heptathlon_pca <- prcomp(heptathlon[, -score_index], scale = TRUE)
print(heptathlon_pca)
summary(heptathlon_pca)

# plot the pairwise scatterplots
plot(heptathlon[,-score_index])

# first PC score for each competitor
center <- heptathlon_pca$center
scale <- heptathlon_pca$scale
hm <- as.matrix(heptathlon[,-score_index])
drop(scale(hm, center = center, scale = scale) %*% heptathlon_pca$rotation[,1])
# or simply
predict(heptathlon_pca)[,1]

# how is the score correlated with the first PC?
cor(heptathlon$score, heptathlon_pca$x[,1])   # well ok
plot(heptathlon$score, heptathlon_pca$x[,1])  # not bad

# how do we interpret it?
plot(heptathlon_pca)
biplot(heptathlon_pca, col = c("gray", "black"))
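# A sketch relating the prcomp output to the eigen-decomposition of the
# correlation matrix (not in the original script): because the variables were
# scaled, the PC variances from prcomp equal the eigenvalues of cor().
ev <- eigen(cor(heptathlon[, -score_index]))
round(ev$values, 3)              # eigenvalues = variances of the PCs
round(heptathlon_pca$sdev^2, 3)  # the same numbers from prcomp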