Hate Crimes
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("ggplot2", "dplyr")
ipak(packages)
Hate_Crimes <- read.csv("hate_crimes.csv")
Hate_Crimes_selected <- read.csv("hate_crimes.csv") %>%
select(c(share_voters_voted_trump, share_white_poverty, share_non_citizen))
\(Y' = a + bX\)
\(Y' = b_{0} + b_{1}X_{1}\)\(Y' = b_{0} + b_{n}X_{n}\)
\(Y' = b_{0} + b_{1}X_{1} + b_{2}X_{2} + ... + b_{n}X_{n}+ e\)Přímka (model) je proložena daty tak, aby jim co nejlépe odpovídala.
\(S_{T}\)\(^{2}\) = \(S_{M}\)\(^{2}\) + \(S_{R}\)\(^{2}\)
\(R^{2}\) = \(SS_{M}\)\(^{2}\) + \(SS_{T}\)\(^{2}\)
\(SS_{M}\)
\(SS_{R}\)
\(SS_{T}\)
\(R^{2}\)
Metoda nejmenších čtverců graficky
ModelHateCrime <- lm(formula = share_voters_voted_trump ~ share_white_poverty +
share_non_citizen, data = Hate_Crimes_selected)
# Generic functions (summary) change their behaviour based on an object's class.
summary(ModelHateCrime)
Call:
lm(formula = share_voters_voted_trump ~ share_white_poverty +
share_non_citizen, data = Hate_Crimes_selected)
Residuals:
Min 1Q Median 3Q Max
-0.25728 -0.03434 -0.00480 0.03619 0.16636
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.40711 0.06397 6.364 8.99e-08 ***
share_white_poverty 1.98561 0.55229 3.595 0.000801 ***
share_non_citizen -1.83930 0.42746 -4.303 8.98e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08387 on 45 degrees of freedom
(3 observations deleted due to missingness)
Multiple R-squared: 0.5326, Adjusted R-squared: 0.5119
F-statistic: 25.64 on 2 and 45 DF, p-value: 3.69e-08
anova(ModelHateCrime)
Analysis of Variance Table
Response: share_voters_voted_trump
Df Sum Sq Mean Sq F value Pr(>F)
share_white_poverty 1 0.23054 0.230537 32.773 7.987e-07 ***
share_non_citizen 1 0.13024 0.130238 18.514 8.976e-05 ***
Residuals 45 0.31655 0.007034
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
predict(ModelHateCrime)
1 2 3 4 5 6 7 8 9 10
0.6085999 0.4526775 0.4018880 0.5718139 0.3467091 0.4357477 0.4158916 0.4739967 0.2842146 0.4599931
11 12 13 14 15 16 17 18 19 21
0.4386739 0.3989617 0.5519579 0.4173547 0.5902069 0.5306387 0.5519579 0.6894873 0.6085999 0.3791057
22 23 24 26 27 28 29 30 31 32
0.4004248 0.5122457 0.4144285 0.5093195 0.5872807 0.4541406 0.3820319 0.4710705 0.3437828 0.4953159
33 34 35 36 37 38 39 40 41 43
0.4217440 0.5137088 0.5306387 0.5504948 0.5321018 0.4769229 0.5306387 0.4188178 0.5306387 0.5916700
44 45 46 47 48 49 50 51
0.3636389 0.4923897 0.5872807 0.4357477 0.4386739 0.6667050 0.5306387 0.5490317
\(b_{i}\)
\(β_{i}\)
\(b_{0}\)
QuantPsyc::lm.beta(ModelHateCrime)
share_white_poverty share_non_citizen
0.3978439 -0.4761472
“To draw conclusions about a population based on a regression analysis done on a sample, several assumptions must be true.” (Field, 2009 , s. 220)
Povaha proměnných - spojité, kvantitativní a kardinální nebo dummy (jen v případě prediktorů).
Nenulová variabilita prediktorů (tj. nejde o konstantu).
Absence (dokonalé) multikolinearity - prediktory by spolu neměly vysoce korelovat.
Prediktory nekorelují s vnějšími proměnnými - absence třetí (intervenující, vnější) proměnné.
Hate_Crimes_Select <- Hate_Crimes%>%
select(c("share_voters_voted_trump", "share_white_poverty", "share_non_citizen"))
lapply(Hate_Crimes_Select, class)
$share_voters_voted_trump
[1] "numeric"
$share_white_poverty
[1] "numeric"
$share_non_citizen
[1] "numeric"
summary(Hate_Crimes_Select)
share_voters_voted_trump share_white_poverty share_non_citizen
Min. :0.040 Min. :0.04000 Min. :0.01000
1st Qu.:0.415 1st Qu.:0.07500 1st Qu.:0.03000
Median :0.490 Median :0.09000 Median :0.04500
Mean :0.490 Mean :0.09176 Mean :0.05458
3rd Qu.:0.575 3rd Qu.:0.10000 3rd Qu.:0.08000
Max. :0.700 Max. :0.17000 Max. :0.13000
NA's :3
psych::describe(Hate_Crimes_Select) # install.packages("psych")
car::vif(ModelHateCrime) # variance inflation factors, install.packages("car")
share_white_poverty share_non_citizen
1.179078 1.179078
sqrt(car::vif(ModelHateCrime)) > 2 # problem?
share_white_poverty share_non_citizen
FALSE FALSE
Rcmdr::rcorr.adjust(Hate_Crimes_Select) # install.packages("Rcmdr"), library(Rcmdr)
Error: 'rcorr.adjust' is not an exported object from 'namespace:Rcmdr'
Homoskedascita - rozptyl reziduí by měl být konstantní napříč různými úrovněmi prediktoru.
Nezávislost reziduí - Reziduální hodnoty kterýchkoliv dvou případů by spolu neměly souviset.
Normálně rozložená rezidua - jejich rozložení by mělo být náhodné.
Nezávislost kterýchkoliv dvou hodnot závislé proměnné (každá hodnota v rámci ní pochází z unikátního zdroje).
Linearita - přímka jako vhodný model popisu dat.
# non-constant error variance test
car::ncvTest(ModelHateCrime)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 3.022336, Df = 1, p = 0.082125
# plot studentized residuals vs. fitted values
car::spreadLevelPlot(ModelHateCrime)
Suggested power transformation: 2.358004
# Test for Autocorrelated Errors
car::durbinWatsonTest(ModelHateCrime)
lag Autocorrelation D-W Statistic p-value
1 -0.003840009 1.934234 0.786
Alternative hypothesis: rho != 0
# distribution of studentized residuals
# library(MASS)
sresid <- MASS::studres(ModelHateCrime)
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
# component + residual plot
car::crPlots(ModelHateCrime)
Nemají některé případy příliš velký vliv na výsledky regrese?
# Bonferonni p-value for most extreme observations
car::outlierTest(ModelHateCrime)
#qq plot for studentized resid
car::qqPlot(ModelHateCrime, main="QQ Plot")
[1] 9 46
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(Hate_Crimes)-length(ModelHateCrime$coefficients)-2))
plot(ModelHateCrime, which=4, cook.levels=cutoff)
# leverage plots
car::leveragePlots(ModelHateCrime)
Dummy proměnné - kategorické proměnné upravené tak, aby mohly vstoupit do (vícenásobné) lineární regrese
Indikátorové kódování (Indicator coding)
Efektové kódování (Effect coding)
\(Y = b_{0} + b_{A1}X_{A1} + b_{A2}X_{A2} + ... + b_{m}X_{m}+ e\)
Po dosazení do regresní rovnice predikujeme případu průměr jeho skupiny (pokud nejsou žádné další prediktory).
Hate_Crimes$UrbanRural = cut(Hate_Crimes$share_population_in_metro_areas,
breaks=c(-Inf,0.5, Inf),
labels=c("Rural","Urban"))
# The factor function
class(Hate_Crimes$UrbanRural)
[1] "factor"
contrasts(Hate_Crimes$UrbanRural)
Urban
Rural 0
Urban 1
summary(lm(share_voters_voted_trump ~ share_white_poverty +
share_non_citizen + UrbanRural, data = Hate_Crimes))
Call:
lm(formula = share_voters_voted_trump ~ share_white_poverty +
share_non_citizen + UrbanRural, data = Hate_Crimes)
Residuals:
Min 1Q Median 3Q Max
-0.253819 -0.035001 -0.005224 0.035566 0.166835
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.404332 0.071068 5.689 9.64e-07 ***
share_white_poverty 1.980436 0.561149 3.529 0.000990 ***
share_non_citizen -1.855612 0.465441 -3.987 0.000249 ***
UrbanRuralUrban 0.004516 0.047773 0.095 0.925123
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08481 on 44 degrees of freedom
(3 observations deleted due to missingness)
Multiple R-squared: 0.5327, Adjusted R-squared: 0.5009
F-statistic: 16.72 on 3 and 44 DF, p-value: 2.151e-07
A mediation analysis is typically conducted to better understand an observed effect of an IV on a DV or a correlation between \(X\) and \(Y\).
If \(X\) and \(Y\) are correlated BECAUSE of the mediator \(M\), then (\(X\) -> \(M\) -> \(Y\))
\(Y = B_{0} + B_1M + e\)
&
\(M = B_0 + B_1X + e\)
&
\(Y = B_0 + B_1M + B_2X + e\)
What will happen to the predictive value of \(X\)?
A mediator variable (M) accounts for some or all of the relationship between X and Y:
model_yx <- lm(Hate_Crimes$share_voters_voted_trump ~ Hate_Crimes$median_household_income)
model_mx <- lm(Hate_Crimes$share_population_in_metro_areas ~ Hate_Crimes$median_household_income)
model_yxm <- lm(Hate_Crimes$share_voters_voted_trump ~ Hate_Crimes$median_household_income + Hate_Crimes$share_population_in_metro_areas)
summary(model_yx)
Call:
lm(formula = Hate_Crimes$share_voters_voted_trump ~ Hate_Crimes$median_household_income)
Residuals:
Min 1Q Median 3Q Max
-0.34945 -0.06275 0.00718 0.04909 0.21359
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.154e-01 8.265e-02 11.075 6.09e-15 ***
Hate_Crimes$median_household_income -7.703e-06 1.477e-06 -5.216 3.67e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09615 on 49 degrees of freedom
Multiple R-squared: 0.357, Adjusted R-squared: 0.3439
F-statistic: 27.21 on 1 and 49 DF, p-value: 3.669e-06
summary(model_mx)
Call:
lm(formula = Hate_Crimes$share_population_in_metro_areas ~ Hate_Crimes$median_household_income)
Residuals:
Min 1Q Median 3Q Max
-0.44283 -0.11326 0.03132 0.13582 0.26112
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.382e-01 1.511e-01 2.901 0.00556 **
Hate_Crimes$median_household_income 5.649e-06 2.699e-06 2.093 0.04154 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1757 on 49 degrees of freedom
Multiple R-squared: 0.08207, Adjusted R-squared: 0.06334
F-statistic: 4.381 on 1 and 49 DF, p-value: 0.04154
summary(model_yxm)
Call:
lm(formula = Hate_Crimes$share_voters_voted_trump ~ Hate_Crimes$median_household_income +
Hate_Crimes$share_population_in_metro_areas)
Residuals:
Min 1Q Median 3Q Max
-0.29936 -0.02858 0.01007 0.05338 0.11238
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.040e+00 7.721e-02 13.470 < 2e-16 ***
Hate_Crimes$median_household_income -6.096e-06 1.330e-06 -4.583 3.29e-05 ***
Hate_Crimes$share_population_in_metro_areas -2.845e-01 6.745e-02 -4.218 0.000109 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08298 on 48 degrees of freedom
Multiple R-squared: 0.5309, Adjusted R-squared: 0.5114
F-statistic: 27.16 on 2 and 48 DF, p-value: 1.288e-08
# install.packages("multilevel")
# Compare the previous results to the output of the sobel function
model_all <- multilevel::sobel(Hate_Crimes$share_population_in_metro_areas,
Hate_Crimes$median_household_income,
Hate_Crimes$share_voters_voted_trump)
# Print out model_all
model_all
$`Mod1: Y~X`
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.7698849 0.05916087 13.013413 1.614296e-17
pred -0.3730823 0.07668876 -4.864888 1.227635e-05
$`Mod2: Y~X+M`
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.040071e+00 7.721107e-02 13.470490 6.321824e-18
pred -2.845277e-01 6.745246e-02 -4.218195 1.085169e-04
med -6.095571e-06 1.330134e-06 -4.582673 3.285506e-05
$`Mod3: M~X`
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44324.99 5354.411 8.278219 7.196527e-11
pred 14527.70 6940.789 2.093090 4.154055e-02
$Indirect.Effect
[1] -0.08855461
$SE
[1] 0.04651217
$z.value
[1] -1.903902
$N
[1] 51
Pokud jsou oboje \(X\) a \(Z\) spojité (resp. intervalové úrovně měření):
\(Y = B_0 + B_1X + B_2Z + B_3(X\)*\(Z) + e\)
\(Y = B_0 + B_1(X_1) + B_2(X_2) + B_3Z + B_4(X_1\)\(Z) + B_5(X_2\)\(Z) + e\)
mod <- Hate_Crimes %>%
select(share_voters_voted_trump, median_household_income, UrbanRural)
psych::describeBy(mod, mod$UrbanRural)
Descriptive statistics by group
group: Rural
---------------------------------------------------------------------------------
group: Urban
boxplot(formula = mod$share_voters_voted_trump ~ mod$UrbanRural,
main = "Boxplot",
xlab = "Group UrbanRural",
ylab = "share_voters_voted_trump")
# Make the subset for the group UrbanRural = "Rural"
mod_Rural <- subset(mod, mod$UrbanRural == "Rural")
# Make the subset for the group UrbanRural = "Urban"
mod_Urban <- subset(mod, mod$UrbanRural == "Urban")
cor(mod_Rural$share_voters_voted_trump, mod_Rural$median_household_income)
[1] -0.1942672
cor(mod_Urban$share_voters_voted_trump, mod_Urban$median_household_income)
[1] -0.6440223
model_1 <- lm(mod$share_voters_voted_trump ~ mod$median_household_income + mod$UrbanRural)
summary(model_1)
Call:
lm(formula = mod$share_voters_voted_trump ~ mod$median_household_income +
mod$UrbanRural)
Residuals:
Min 1Q Median 3Q Max
-0.34577 -0.05937 0.00564 0.05396 0.15812
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.609e-01 8.848e-02 10.860 1.59e-14 ***
mod$median_household_income -7.524e-06 1.470e-06 -5.119 5.37e-06 ***
mod$UrbanRuralUrban -6.141e-02 4.507e-02 -1.363 0.179
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09532 on 48 degrees of freedom
Multiple R-squared: 0.381, Adjusted R-squared: 0.3552
F-statistic: 14.77 on 2 and 48 DF, p-value: 1.002e-05
moderator <- mod$median_household_income * as.numeric(mod$UrbanRural)
model_2 <- lm(mod$share_voters_voted_trump ~ mod$median_household_income + mod$UrbanRural + moderator)
summary(model_2)
Call:
lm(formula = mod$share_voters_voted_trump ~ mod$median_household_income +
mod$UrbanRural + moderator)
Residuals:
Min 1Q Median 3Q Max
-0.33854 -0.05012 0.00546 0.05554 0.14622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.023e-01 2.440e-01 2.878 0.006 **
mod$median_household_income 2.844e-06 9.242e-06 0.308 0.760
mod$UrbanRuralUrban 2.285e-01 2.591e-01 0.882 0.382
moderator -5.467e-06 4.811e-06 -1.136 0.262
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09504 on 47 degrees of freedom
Multiple R-squared: 0.3975, Adjusted R-squared: 0.3591
F-statistic: 10.34 on 3 and 47 DF, p-value: 2.431e-05
anova(model_1, model_2)
Analysis of Variance Table
Model 1: mod$share_voters_voted_trump ~ mod$median_household_income +
mod$UrbanRural
Model 2: mod$share_voters_voted_trump ~ mod$median_household_income +
mod$UrbanRural + moderator
Res.Df RSS Df Sum of Sq F Pr(>F)
1 48 0.43616
2 47 0.42450 1 0.011661 1.2911 0.2616
color <- c("red","green","blue")
ggplot(mod, aes(x = median_household_income, y = share_voters_voted_trump)) +
geom_smooth(method = "lm", color = "black") +
geom_point(aes(color = UrbanRural))
ggplot(mod, aes(x = median_household_income, y = share_voters_voted_trump)) +
geom_smooth(aes(group = UrbanRural), method = "lm", se = T, color = "black", fullrange = T) +
geom_point(aes(color = UrbanRural))
A moderator has influence over other effects or relationships, whereas the mediator explains a relationship.
Field, A. (2009). Discovering statistics using SPSS, 3th Ed. Los Angeles: Sage.
Fox, J. (2016). Applied Regression Analysis and Generalized Linear Models, 3th Ed. Los Angeles: Sage.
Robotková, A., & Ježek, S. (2012). Vícenásobná lineární regrese. Prezentace ke kurzu PSY252.
A work by Vit Gabrhel