#This is a file with executable R code for chapter 3 of Natalia Levshina’s (2015) #How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins. ####################################################################################

###Section 3.1

  • first: install the RLing package
# doesn't work
# library(remotes)
# install_github("levshina/Rling/Rling_1.0.tar.gz")
# library(devtools)
# 
# this works:
# install.packages("Rling_1.0.tar.gz", repos = NULL, type = "source")

##Main text

#install.packages("modeest")


library(Rling); library(modeest)

data(ldt)

head(ldt)
             Length Freq Mean_RT
marveled          8  131  819.19
persuaders       10   82  977.63
midmost           7    0  908.22
crutch            6  592  766.30
resuspension     12    2 1125.42
efflorescent     12    9  948.33
mean(ldt$Length)
[1] 8.23
sort(ldt$Length)
  [1]  3  3  4  4  4  4  4  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6  6
 [26]  6  6  7  7  7  7  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8
 [51]  8  8  8  8  8  9  9  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10
 [76] 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 12 12 12 13 14 14 15
median(ldt$Length)
[1] 8
quantile(ldt$Length, 0.25)
25% 
  6 
quantile(ldt$Length, 0.5)
50% 
  8 
table(ldt$Length)

 3  4  5  6  7  8  9 10 11 12 13 14 15 
 2  5  7 13 12 16 11 16 11  3  1  2  1 
mlv(ldt$Length)
[1]  8 10
summary(ldt$Length)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   3.00    6.00    8.00    8.23   10.00   15.00 
max(ldt$Length)
[1] 15
min(ldt$Length)
[1] 3
range(ldt$Length)
[1]  3 15
var(ldt$Length)
[1] 6.259697
sqrt(var(ldt$Length))
[1] 2.501939
sd(ldt$Length)
[1] 2.501939
IQR(ldt$Length)
[1] 4
mad(ldt$Length, constant = 1)
[1] 2

##Boxes with additional information

attach(ldt)

mean(Length)
[1] 8.23
detach(ldt)

produces error: mean(Length)

###Section 3.2

##Main text

#install.packages("ggplot2")


library(Rling); library(ggplot2)

data(ldt)

hist(ldt$Mean_RT, main = "Histogram of mean reaction times", xlab = "reaction times, ms")

plot(density(ldt$Mean_RT), main = "Density plot of mean reaction times", xlab = "reaction times, ms")

qqnorm(ldt$Mean_RT)

qqline(ldt$Mean_RT)

summary(ldt$Mean_RT)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  564.2   713.1   784.9   808.3   905.2  1458.8 
shapiro.test(ldt$Mean_RT)

    Shapiro-Wilk normality test

data:  ldt$Mean_RT
W = 0.92006, p-value = 1.418e-05
boxplot(ldt$Mean_RT, main = "Mean reaction times", ylab = "reaction time in ms")

ldt[ldt$Mean_RT > 1200, ]
                Length Freq Mean_RT
dessertspoon        12   11 1314.33
acquisitiveness     15    4 1216.81
diacritical         11  162 1458.75
boxplot.stats(ldt$Mean_RT)$out
[1] 1314.33 1216.81 1458.75
normalize(ldt$Mean_RT)
  [1]  0.07138544  1.10554648  0.65249726 -0.27383532  2.07019341  0.91430110
  [7] -1.08731355  0.05669936  0.80203418 -1.00605058 -1.46497423 -0.92080605
 [13]  1.60905053 -0.46292675  0.80144674 -0.39262938  0.27033285  0.69348774
 [19] -1.22360036  0.66881513  1.11324851  0.41399534  0.91286512  0.87859761
 [25] -1.12066727 -1.38991205 -0.75586507 -0.37180779  3.30323659 -0.13343641
 [31] -0.85853708 -0.29700669 -0.34387160 -0.10315045 -1.01832161 -0.31939480
 [37]  0.07621553  1.23413126  0.06002821 -0.52160579 -0.36449738  0.15858811
 [43] -0.61559670 -0.26006304  0.05493703 -0.39021434  1.34091537 -1.59310211
 [49] -0.25451497  0.50903059  0.26595967 -0.06457502  0.84818111  0.67553809
 [55] -0.26228227 -0.54934616  0.70301737  0.19011423  0.21491738  1.83247475
 [61]  0.68715641 -0.85076977 -1.39748353 -0.80103292 -0.43694870 -0.59627634
 [67]  0.62802047 -0.37859602  0.33965115  2.66670930 -1.14331646 -0.12606073
 [73] -0.56324898 -0.30797230 -1.13926963 -0.48635920 -0.17090223  1.12264760
 [79]  0.02700085  0.75379857  0.49114621 -0.90736013  0.64707973 -0.53687931
 [85]  0.04090367 -0.29178497 -0.94293308  0.41967396 -1.04599671 -0.82975236
 [91] -1.09129511 -0.21319813 -1.34565799 -1.51216549 -0.10001742  0.44584782
 [97] -0.63720155  0.43742780 -1.18234880  4.24588704
ldt[abs(normalize(ldt$Mean_RT)) >= 2.5,]
                Length Freq Mean_RT
dessertspoon        12   11 1314.33
acquisitiveness     15    4 1216.81
diacritical         11  162 1458.75
ldt[abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5,]
                Length Freq Mean_RT
dessertspoon        12   11 1314.33
acquisitiveness     15    4 1216.81
diacritical         11  162 1458.75
outliers <- which(abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5)

outliers
[1]  29  70 100
ldt_remove <- ldt[-outliers,]

dim(ldt_remove)
[1] 97  3
mean(ldt$Mean_RT) + 2*sd(ldt$Mean_RT)
[1] 1114.666
ldt_new <- ldt

ldt_new[outliers, 3] <- 1114.666

ldt_new[outliers,]
                Length Freq  Mean_RT
dessertspoon        12   11 1114.666
acquisitiveness     15    4 1114.666
diacritical         11  162 1114.666

##Boxes with additional information

nbins <- 1 + 3.32*log10(length(ldt$Mean_RT))

nbins
[1] 7.64
binsize <- diff(range(ldt$Mean_RT))/nbins

ggplot(ldt, aes(x = Mean_RT)) + geom_histogram(binwidth = binsize, fill = "white", colour = "black")

ggplot(ldt, aes(x = Mean_RT)) + geom_line(stat = "density")

ggplot(ldt, aes(sample = Mean_RT)) + stat_qq()

options(scipen = 999)

ggplot(ldt, aes(x = 1, y = Mean_RT)) + geom_boxplot() + theme(axis.title.x = element_blank()) + scale_x_continuous(breaks = NULL)

###Section 3.3

##Main text

# install.packages("ggplot2") #if you have not done so yet

library(Rling); library(ggplot2)

data(ldt)

summary(ldt$Freq)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    53.5   310.5  3350.3  2103.2 75075.0 
plot(sort(ldt$Freq, decreasing = TRUE), type = "b", main = "Zipf?s law", ylab = "Word frequency")

par(mfrow = c(1, 3))

hist(ldt$Freq, main = "Histogram of word frequencies", xlab = "Word frequency in a corpus", ylab = "Relative frequency in the sample")

plot(density(ldt$Freq), main = "Density plot of word frequencies",xlab = "Word frequency in a corpus")

qqnorm(ldt$Freq)

qqline(ldt$Freq)

log(0)
[1] -Inf
hist(log1p(ldt$Freq), main = "Histogram of log-frequencies", xlab = "Log-transformed word frequency", ylab = "Relative frequency in the sample")

plot(density(log1p(ldt$Freq)), main = "Density plot of log-frequencies", xlab = "Log-transformed word frequency in a corpus")

qqnorm(log1p(ldt$Freq))

qqline(log1p(ldt$Freq))

shapiro.test(log1p(ldt$Freq))

    Shapiro-Wilk normality test

data:  log1p(ldt$Freq)
W = 0.98138, p-value = 0.1699

Boxes with additional information

ggplot(ldt, aes(x = 1:nrow(ldt), y = sort(Freq, decreasing = TRUE))) + geom_line() + geom_point() + xlab("Index") + ylab("Frequency")