library(AER)
## Loading required package: car
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(ggplot2)

Auta

Načtení data:

data("mtcars")

Vizualizace dat:

ggplot(mtcars, aes(wt, mpg, color = factor(cyl), size = hp)) +
    geom_point()

Odhad

Modely:

mod1 <- mpg ~ wt
mod2 <- update(mod1, . ~ . + hp + cyl)
mod3 <- update(mod1, . ~ . + hp * cyl)
mod4 <- update(mod1, . ~ . + hp * factor(cyl))
mod5 <- update(mod4, . ~ . - wt + log(wt))
mod5
## mpg ~ hp + factor(cyl) + log(wt) + hp:factor(cyl)

Odhad modelu:

m1 <- lm(mod1, data = mtcars)
summary(m1)
## 
## Call:
## lm(formula = mod1, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10

Odhad více modelů:

models <- list(mod1, mod2, mod3, mod4, mod5)
estims <- lapply(models, function(x) lm(x, data = mtcars))

prezentace více modelů:

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(estims, type = "text")
## 
## ======================================================================================================================================
##                                                                    Dependent variable:                                                
##                     ------------------------------------------------------------------------------------------------------------------
##                                                                            mpg                                                        
##                              (1)                    (2)                    (3)                    (4)                    (5)          
## --------------------------------------------------------------------------------------------------------------------------------------
## wt                        -5.344***              -3.167***              -3.120***              -3.060***                              
##                            (0.559)                (0.741)                (0.661)                (0.683)                               
##                                                                                                                                       
## hp                                                 -0.018               -0.164***              -0.099***              -0.092***       
##                                                   (0.012)                (0.052)                (0.035)                (0.032)        
##                                                                                                                                       
## cyl                                               -0.942*               -2.742***                                                     
##                                                   (0.551)                (0.800)                                                      
##                                                                                                                                       
## hp:cyl                                                                   0.019***                                                     
##                                                                          (0.007)                                                      
##                                                                                                                                       
## factor(cyl)6                                                                                    -9.982*                 -8.102        
##                                                                                                 (5.769)                (5.381)        
##                                                                                                                                       
## factor(cyl)8                                                                                   -11.728**              -10.051**       
##                                                                                                 (4.225)                (3.970)        
##                                                                                                                                       
## log(wt)                                                                                                               -10.794***      
##                                                                                                                        (2.039)        
##                                                                                                                                       
## hp:factor(cyl)6                                                                                  0.078                  0.069         
##                                                                                                 (0.052)                (0.048)        
##                                                                                                                                       
## hp:factor(cyl)8                                                                                 0.086**                0.078**        
##                                                                                                 (0.037)                (0.034)        
##                                                                                                                                       
## Constant                  37.285***              38.752***              52.018***              41.877***              42.907***       
##                            (1.878)                (1.787)                (4.917)                (3.233)                (3.021)        
##                                                                                                                                       
## --------------------------------------------------------------------------------------------------------------------------------------
## Observations                  32                     32                     32                     32                     32          
## R2                          0.753                  0.843                  0.879                  0.883                  0.900         
## Adjusted R2                 0.745                  0.826                  0.862                  0.854                  0.876         
## Residual Std. Error    3.046 (df = 30)        2.512 (df = 28)        2.242 (df = 27)        2.300 (df = 25)        2.121 (df = 25)    
## F Statistic         91.375*** (df = 1; 30) 50.171*** (df = 3; 28) 49.251*** (df = 4; 27) 31.317*** (df = 6; 25) 37.569*** (df = 6; 25)
## ======================================================================================================================================
## Note:                                                                                                      *p<0.1; **p<0.05; ***p<0.01
library(broom)
m5 <- estims[[5]]
tidy(m5)
##              term     estimate  std.error statistic      p.value
## 1     (Intercept)  42.90666182 3.02086997 14.203412 1.787500e-13
## 2              hp  -0.09225948 0.03226566 -2.859370 8.442745e-03
## 3    factor(cyl)6  -8.10229605 5.38072716 -1.505799 1.446503e-01
## 4    factor(cyl)8 -10.05130192 3.97017464 -2.531703 1.801087e-02
## 5         log(wt) -10.79406355 2.03877336 -5.294391 1.743246e-05
## 6 hp:factor(cyl)6   0.06893945 0.04843774  1.423259 1.670218e-01
## 7 hp:factor(cyl)8   0.07812729 0.03426396  2.280160 3.139175e-02
glance(m5)
##   r.squared adj.r.squared    sigma statistic      p.value df    logLik
## 1 0.9001654     0.8762051 2.120552  37.56903 2.497248e-11  7 -65.50991
##        AIC      BIC deviance df.residual
## 1 147.0198 158.7457 112.4185          25
augment(m5)
##              .rownames  mpg  hp factor.cyl.   log.wt.  .fitted   .se.fit
## 1            Mazda RX4 21.0 110           6 0.9631743 21.84260 0.9890285
## 2        Mazda RX4 Wag 21.0 110           6 1.0560527 20.84006 0.9330202
## 3           Datsun 710 22.8  93           4 0.8415672 25.24260 0.7219892
## 4       Hornet 4 Drive 21.4 110           6 1.1678274 19.63356 0.9143613
## 5    Hornet Sportabout 18.7 175           8 1.2354715 17.04647 0.7436189
## 6              Valiant 18.1 105           6 1.2412686 18.95743 1.0260913
## 7           Duster 360 14.3 245           8 1.2725656 15.65682 0.7295476
## 8            Merc 240D 24.4  62           4 1.1600209 24.66523 1.2301590
## 9             Merc 230 22.8  95           4 1.1474025 21.75688 1.0028817
## 10            Merc 280 19.2 123           6 1.2354715 18.60024 0.8301657
## 11           Merc 280C 17.8 123           6 1.2354715 18.60024 0.8301657
## 12          Merc 450SE 16.4 180           8 1.4036430 15.16055 0.6628058
## 13          Merc 450SL 17.3 180           8 1.3164082 16.10217 0.6686205
## 14         Merc 450SLC 15.2 180           8 1.3297240 15.95844 0.6646656
## 15  Cadillac Fleetwood 10.4 205           8 1.6582281 12.05924 0.8164395
## 16 Lincoln Continental 10.4 215           8 1.6908336 11.56598 0.8664144
## 17   Chrysler Imperial 14.7 230           8 1.6761615 11.51236 0.8747715
## 18            Fiat 128 32.4  66           4 0.7884574 28.30688 0.8334929
## 19         Honda Civic 30.4  52           4 0.4793350 32.93520 1.2864065
## 20      Toyota Corolla 33.9  65           4 0.6070445 30.35732 0.9119299
## 21       Toyota Corona 21.5  97           4 0.9021918 24.21918 0.8029580
## 22    Dodge Challenger 15.5 150           8 1.2584610 17.15162 0.9165122
## 23         AMC Javelin 15.2 150           8 1.2340169 17.41547 0.9301737
## 24          Camaro Z28 13.3 245           8 1.3454724 14.86986 0.7031688
## 25    Pontiac Firebird 19.2 175           8 1.3467736 15.84507 0.6923763
## 26           Fiat X1-9 27.3  66           4 0.6601073 29.69230 0.8603249
## 27       Porsche 914-2 26.0  91           4 0.7608058 26.29886 0.7017866
## 28        Lotus Europa 30.4 113           4 0.4140944 28.01158 1.4721561
## 29      Ford Pantera L 15.8 264           8 1.1537316 16.67101 0.9581067
## 30        Ferrari Dino 19.7 175           6 1.0188473 19.72586 2.0459805
## 31       Maserati Bora 15.0 335           8 1.2725656 14.38492 1.5711799
## 32          Volvo 142E 21.4 109           4 1.0224509 21.81398 1.1169443
##         .resid       .hat   .sigma      .cooksd  .std.resid
## 1  -0.84259798 0.21753037 2.155527 0.0080136432 -0.44919801
## 2   0.15993690 0.19359067 2.163974 0.0002419218  0.08398893
## 3  -2.44260086 0.11592141 2.098312 0.0281119652 -1.22506244
## 4   1.76643994 0.18592510 2.127063 0.0278106612  0.92324688
## 5   1.65353020 0.12297113 2.134059 0.0138868553  0.83263822
## 6  -0.85743088 0.23413931 2.155019 0.0093234637 -0.46203554
## 7  -1.35682045 0.11836125 2.144085 0.0089058886 -0.68144091
## 8  -0.26523481 0.33653082 2.163258 0.0017086324 -0.15355759
## 9   1.04312340 0.22366691 2.150745 0.0128286572  0.55829379
## 10  0.59975515 0.15326107 2.160186 0.0024427840  0.30736215
## 11 -0.80024485 0.15326107 2.156986 0.0043489344 -0.41010898
## 12  1.23944529 0.09769557 2.147828 0.0058563589  0.61532134
## 13  1.19782769 0.09941723 2.148888 0.0055873688  0.59522846
## 14 -0.75844098 0.09824459 2.158130 0.0022078979 -0.37664185
## 15 -1.65924256 0.14823487 2.132939 0.0178704165 -0.84781502
## 16 -1.16597514 0.16693741 2.148513 0.0103891802 -0.60242312
## 17  3.18763708 0.17017335 2.043014 0.0797735287  1.65016107
## 18  4.09312246 0.15449205 1.964302 0.1150229480  2.09916727
## 19 -2.53519708 0.36800928 2.064064 0.1881330460 -1.50386043
## 20  3.54268084 0.18493761 2.010599 0.1109971604  1.85049433
## 21 -2.71917694 0.14337981 2.079533 0.0458976636 -1.38546153
## 22 -1.65162412 0.18680082 2.131744 0.0244800116 -0.86370196
## 23 -2.21547490 0.19241122 2.104962 0.0460031727 -1.16258001
## 24 -1.56986013 0.10995665 2.137459 0.0108673927 -0.78470521
## 25  3.35493249 0.10660723 2.039401 0.0477610271  1.67383917
## 26 -2.39229597 0.16459903 2.097298 0.0428816439 -1.23429389
## 27 -0.29886303 0.10952478 2.163313 0.0003919380 -0.14935248
## 28  2.38842062 0.48195894 2.055548 0.3254674606  1.56487581
## 29 -0.87101075 0.20414093 2.155083 0.0077679973 -0.46042255
## 30 -0.02585828 0.93090487 2.164186 0.0041420382 -0.04639029
## 31  0.61507628 0.54897701 2.156189 0.0324353831  0.43189761
## 32 -0.41397864 0.27743762 2.161995 0.0028931757 -0.22966320

Diagnostika

modelu m5

rozdělení náhodné složky (imho zbytečné):

qqPlot(m5)

linearita modelu:

raintest(m5)
## 
##  Rainbow test
## 
## data:  m5
## Rain = 0.51395, df1 = 16, df2 = 9, p-value = 0.8824

heteroskedasticita:

bptest(m5)
## 
##  studentized Breusch-Pagan test
## 
## data:  m5
## BP = 9.6483, df = 6, p-value = 0.1403
ncvTest(m5)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 2.943605    Df = 1     p = 0.08621809

autokorelace (tady zrovna nemá smysl):

durbinWatsonTest(m5)
##  lag Autocorrelation D-W Statistic p-value
##    1      -0.2029913      2.398143    0.36
##  Alternative hypothesis: rho != 0

multikolinearita (problém, pokud VIF > 4):

vif(m5)
##                       GVIF Df GVIF^(1/(2*Df))
## hp               33.738297  1        5.808468
## factor(cyl)     612.000904  2        4.973797
## log(wt)           2.888584  1        1.699583
## hp:factor(cyl) 2129.740990  2        6.793314

Robustní směrodatné odchylky

coeftest(m5, vcov. = vcovHC)  # vcovHAC pro heteroskedasticitu a autokorelaci (mnoho variant)
## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)      42.906662   4.910080  8.7385 4.528e-09 ***
## hp               -0.092259   0.053273 -1.7318 0.0956261 .  
## factor(cyl)6     -8.102296   5.244885 -1.5448 0.1349605    
## factor(cyl)8    -10.051302   5.599233 -1.7951 0.0847346 .  
## log(wt)         -10.794064   2.685770 -4.0190 0.0004719 ***
## hp:factor(cyl)6   0.068939   0.055434  1.2436 0.2251696    
## hp:factor(cyl)8   0.078127   0.054332  1.4379 0.1628496    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Predikce

in-sample:

pred <- predict(m5)
ggplot(data.frame(skutecne = mtcars$mpg, predpoved = pred), aes(skutecne, predpoved)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    geom_smooth()
## `geom_smooth()` using method = 'loess'

out-of-sample:

set.seed(12345)
idx <- sample(nrow(mtcars), round(nrow(mtcars) / 2))
estim_sample <- mtcars[idx, ]
test_sample <- mtcars[-idx, ]
m6 <- lm(mod5, estim_sample)
predex <- predict(m6, newdata = test_sample)
ggplot(data.frame(skutecne = test_sample$mpg, predpoved = predex), aes(skutecne, predpoved)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    geom_smooth()
## `geom_smooth()` using method = 'loess'

Kosatce

data:

data(iris)

vizualizace:

ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + geom_smooth(method = "lm")

ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() + geom_smooth(method = "lm")

Odhad modelů

iris1 <- Sepal.Width ~ Sepal.Length
est_iris1 <- lm(iris1, iris)
est_iris2 <- update(est_iris1, . ~ . * Species)
stargazer(est_iris1, est_iris2, type = "text")
## 
## ==========================================================================
##                                            Dependent variable:            
##                                -------------------------------------------
##                                                Sepal.Width                
##                                        (1)                   (2)          
## --------------------------------------------------------------------------
## Sepal.Length                         -0.062               0.799***        
##                                      (0.043)               (0.110)        
##                                                                           
## Speciesversicolor                                          1.442**        
##                                                            (0.713)        
##                                                                           
## Speciesvirginica                                          2.016***        
##                                                            (0.686)        
##                                                                           
## Sepal.Length:Speciesversicolor                            -0.479***       
##                                                            (0.134)        
##                                                                           
## Sepal.Length:Speciesvirginica                             -0.567***       
##                                                            (0.126)        
##                                                                           
## Constant                            3.419***               -0.569         
##                                      (0.254)               (0.554)        
##                                                                           
## --------------------------------------------------------------------------
## Observations                           150                   150          
## R2                                    0.014                 0.623         
## Adjusted R2                           0.007                 0.610         
## Residual Std. Error             0.434 (df = 148)      0.272 (df = 144)    
## F Statistic                    2.074 (df = 1; 148) 47.534*** (df = 5; 144)
## ==========================================================================
## Note:                                          *p<0.1; **p<0.05; ***p<0.01

Diagnostika

jako výše

Predikce

jako výše

Testování hypotéz

hypotézy o parametrech:

linearHypothesis(est_iris2, "Sepal.Length:Speciesversicolor = Sepal.Length:Speciesvirginica")
## Linear hypothesis test
## 
## Hypothesis:
## Sepal.Length:Speciesversicolor - Sepal.Length:Speciesvirginica = 0
## 
## Model 1: restricted model
## Model 2: Sepal.Width ~ Sepal.Length + Species + Sepal.Length:Species
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    145 10.741                           
## 2    144 10.680  1  0.060706 0.8185 0.3671

hypotézy o nested modelech:

anova(est_iris1, est_iris2)
## Analysis of Variance Table
## 
## Model 1: Sepal.Width ~ Sepal.Length
## Model 2: Sepal.Width ~ Sepal.Length + Species + Sepal.Length:Species
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    148 27.916                                  
## 2    144 10.680  4    17.236 58.098 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Další nápady na diagnostiku

http://www.statmethods.net/stats/rdiagnostics.html