# doesn't work
# library(remotes)
# install_github("levshina/Rling/Rling_1.0.tar.gz")
# library(devtools)
#
# this works:
# install.packages("Rling_1.0.tar.gz", repos = NULL, type = "source")
#This is a file with executable R code for chapter 3 of Natalia Levshina’s (2015) #How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins. ####################################################################################
###Section 3.1
- first: install the RLing package
##Main text
#install.packages("modeest")
library(Rling); library(modeest)
data(ldt)
head(ldt)
Length Freq Mean_RT
marveled 8 131 819.19
persuaders 10 82 977.63
midmost 7 0 908.22
crutch 6 592 766.30
resuspension 12 2 1125.42
efflorescent 12 9 948.33
mean(ldt$Length)
[1] 8.23
sort(ldt$Length)
[1] 3 3 4 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6
[26] 6 6 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8
[51] 8 8 8 8 8 9 9 9 9 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10 10
[76] 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 12 12 12 13 14 14 15
median(ldt$Length)
[1] 8
quantile(ldt$Length, 0.25)
25%
6
quantile(ldt$Length, 0.5)
50%
8
table(ldt$Length)
3 4 5 6 7 8 9 10 11 12 13 14 15
2 5 7 13 12 16 11 16 11 3 1 2 1
mlv(ldt$Length)
[1] 8 10
summary(ldt$Length)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.00 6.00 8.00 8.23 10.00 15.00
max(ldt$Length)
[1] 15
min(ldt$Length)
[1] 3
range(ldt$Length)
[1] 3 15
var(ldt$Length)
[1] 6.259697
sqrt(var(ldt$Length))
[1] 2.501939
sd(ldt$Length)
[1] 2.501939
IQR(ldt$Length)
[1] 4
mad(ldt$Length, constant = 1)
[1] 2
##Boxes with additional information
attach(ldt)
mean(Length)
[1] 8.23
detach(ldt)
produces error: mean(Length)
###Section 3.2
##Main text
#install.packages("ggplot2")
library(Rling); library(ggplot2)
data(ldt)
hist(ldt$Mean_RT, main = "Histogram of mean reaction times", xlab = "reaction times, ms")
plot(density(ldt$Mean_RT), main = "Density plot of mean reaction times", xlab = "reaction times, ms")
qqnorm(ldt$Mean_RT)
qqline(ldt$Mean_RT)
summary(ldt$Mean_RT)
Min. 1st Qu. Median Mean 3rd Qu. Max.
564.2 713.1 784.9 808.3 905.2 1458.8
shapiro.test(ldt$Mean_RT)
Shapiro-Wilk normality test
data: ldt$Mean_RT
W = 0.92006, p-value = 1.418e-05
boxplot(ldt$Mean_RT, main = "Mean reaction times", ylab = "reaction time in ms")
$Mean_RT > 1200, ] ldt[ldt
Length Freq Mean_RT
dessertspoon 12 11 1314.33
acquisitiveness 15 4 1216.81
diacritical 11 162 1458.75
boxplot.stats(ldt$Mean_RT)$out
[1] 1314.33 1216.81 1458.75
normalize(ldt$Mean_RT)
[1] 0.07138544 1.10554648 0.65249726 -0.27383532 2.07019341 0.91430110
[7] -1.08731355 0.05669936 0.80203418 -1.00605058 -1.46497423 -0.92080605
[13] 1.60905053 -0.46292675 0.80144674 -0.39262938 0.27033285 0.69348774
[19] -1.22360036 0.66881513 1.11324851 0.41399534 0.91286512 0.87859761
[25] -1.12066727 -1.38991205 -0.75586507 -0.37180779 3.30323659 -0.13343641
[31] -0.85853708 -0.29700669 -0.34387160 -0.10315045 -1.01832161 -0.31939480
[37] 0.07621553 1.23413126 0.06002821 -0.52160579 -0.36449738 0.15858811
[43] -0.61559670 -0.26006304 0.05493703 -0.39021434 1.34091537 -1.59310211
[49] -0.25451497 0.50903059 0.26595967 -0.06457502 0.84818111 0.67553809
[55] -0.26228227 -0.54934616 0.70301737 0.19011423 0.21491738 1.83247475
[61] 0.68715641 -0.85076977 -1.39748353 -0.80103292 -0.43694870 -0.59627634
[67] 0.62802047 -0.37859602 0.33965115 2.66670930 -1.14331646 -0.12606073
[73] -0.56324898 -0.30797230 -1.13926963 -0.48635920 -0.17090223 1.12264760
[79] 0.02700085 0.75379857 0.49114621 -0.90736013 0.64707973 -0.53687931
[85] 0.04090367 -0.29178497 -0.94293308 0.41967396 -1.04599671 -0.82975236
[91] -1.09129511 -0.21319813 -1.34565799 -1.51216549 -0.10001742 0.44584782
[97] -0.63720155 0.43742780 -1.18234880 4.24588704
abs(normalize(ldt$Mean_RT)) >= 2.5,] ldt[
Length Freq Mean_RT
dessertspoon 12 11 1314.33
acquisitiveness 15 4 1216.81
diacritical 11 162 1458.75
abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5,] ldt[
Length Freq Mean_RT
dessertspoon 12 11 1314.33
acquisitiveness 15 4 1216.81
diacritical 11 162 1458.75
<- which(abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5)
outliers
outliers
[1] 29 70 100
<- ldt[-outliers,]
ldt_remove
dim(ldt_remove)
[1] 97 3
mean(ldt$Mean_RT) + 2*sd(ldt$Mean_RT)
[1] 1114.666
<- ldt
ldt_new
3] <- 1114.666
ldt_new[outliers,
ldt_new[outliers,]
Length Freq Mean_RT
dessertspoon 12 11 1114.666
acquisitiveness 15 4 1114.666
diacritical 11 162 1114.666
##Boxes with additional information
<- 1 + 3.32*log10(length(ldt$Mean_RT))
nbins
nbins
[1] 7.64
<- diff(range(ldt$Mean_RT))/nbins
binsize
ggplot(ldt, aes(x = Mean_RT)) + geom_histogram(binwidth = binsize, fill = "white", colour = "black")
ggplot(ldt, aes(x = Mean_RT)) + geom_line(stat = "density")
ggplot(ldt, aes(sample = Mean_RT)) + stat_qq()
options(scipen = 999)
ggplot(ldt, aes(x = 1, y = Mean_RT)) + geom_boxplot() + theme(axis.title.x = element_blank()) + scale_x_continuous(breaks = NULL)
###Section 3.3
##Main text
# install.packages("ggplot2") #if you have not done so yet
library(Rling); library(ggplot2)
data(ldt)
summary(ldt$Freq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 53.5 310.5 3350.3 2103.2 75075.0
plot(sort(ldt$Freq, decreasing = TRUE), type = "b", main = "Zipf?s law", ylab = "Word frequency")
par(mfrow = c(1, 3))
hist(ldt$Freq, main = "Histogram of word frequencies", xlab = "Word frequency in a corpus", ylab = "Relative frequency in the sample")
plot(density(ldt$Freq), main = "Density plot of word frequencies",xlab = "Word frequency in a corpus")
qqnorm(ldt$Freq)
qqline(ldt$Freq)
log(0)
[1] -Inf
hist(log1p(ldt$Freq), main = "Histogram of log-frequencies", xlab = "Log-transformed word frequency", ylab = "Relative frequency in the sample")
plot(density(log1p(ldt$Freq)), main = "Density plot of log-frequencies", xlab = "Log-transformed word frequency in a corpus")
qqnorm(log1p(ldt$Freq))
qqline(log1p(ldt$Freq))
shapiro.test(log1p(ldt$Freq))
Shapiro-Wilk normality test
data: log1p(ldt$Freq)
W = 0.98138, p-value = 0.1699
Boxes with additional information
ggplot(ldt, aes(x = 1:nrow(ldt), y = sort(Freq, decreasing = TRUE))) + geom_line() + geom_point() + xlab("Index") + ylab("Frequency")