\usepackage{linguex}

Barvy

Author

MD

Published

March 19, 2024

# install instructions for RLing
#
# - [ ] install devtools
# - [ ] gunzip Rling_1.0.tar.gz
# - [ ] devtools::install_local("Rling_1.0.tar")
# - [ ] also works: install.packages(pkgs = file.choose(), repos = NULL, type = "source")

library(Rling)
#library(languageR)

# English 'black'

data(colreg)

colreg
       spoken fiction academic press
black   20335   41118    26892 73080
blue     4693   22093     3605 21210
brown    1185   10914     1201 11539
gray     1168   12140     1289  6559
green    3860   14398     4477 26837
orange    931    3496      474  5766
pink      962    7312      584  6356
purple    613    3366      429  3403
red      7230   25111     5621 34596
white   14474   40745    26336 54883
yellow   1349   10553     1855 10382

Stará debata

Derivační morfologie?

  • Levshina: základní nápad – Deviation of Proportions, DP (Gries 2008; Lijijt & Gries 2012)
  • více lingvistický přístup?
  • intuice:

. . Na obzoru se černaly mraky. Na obzoru se ??oranžovělo slunce.

. . Petr nabílil košili. Petr ??naoranžověl košili.

. . Petr začernil text. Petr ??zaokroval text.

  • DeriNET search: [pos=“A” lemma=“bílý”] [pos=“V”]
  • vs. [pos=“A” lemma=“okrový”] [pos=“V”]
  • DeriNET – v1

Korpusy

freqreg <- c(95385672, 90344134, 91044778, 187245672)

freqreg
[1]  95385672  90344134  91044778 187245672
exp_prop <- prop.table(freqreg)

exp_prop
[1] 0.2055636 0.1946987 0.1962086 0.4035291
mosaicplot(freqreg)

colreg[1,]
      spoken fiction academic press
black  20335   41118    26892 73080
black_obs <- prop.table(colreg[1,])

black_obs
         spoken   fiction  academic    press
black 0.1259718 0.2547189 0.1665913 0.452718
mosaicplot(black_obs)

DP_black <- sum(abs(black_obs - exp_prop))/2

DP_black
[1] 0.1092091
min(exp_prop)
[1] 0.1946987
DP_black_norm <- DP_black/(1 - min(exp_prop))

DP_black_norm
[1] 0.1356127

English “gray”

gray_obs <- prop.table(colreg[4,])

gray_obs
         spoken   fiction   academic     press
gray 0.05520892 0.5738325 0.06092834 0.3100303
DP_gray <- sum(abs(gray_obs - exp_prop))/2

DP_gray
[1] 0.3791338
DP_gray_norm <- DP_gray/(1 - min(exp_prop))

DP_gray_norm
[1] 0.4707974

Czech “černý”

cerny_obs <- c(363,181,175,124)

cerny_obs
[1] 363 181 175 124
freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop
[1] 0.25 0.25 0.25 0.25
cerny_obs <- prop.table(cerny_obs)

cerny_obs 
[1] 0.4306050 0.2147094 0.2075919 0.1470937
DP_cerny <- sum(abs(cerny_obs - exp_cz_prop))/2

DP_cerny
[1] 0.180605
DP_cerny_norm <- DP_cerny/(1 - min(exp_cz_prop))

DP_cerny_norm
[1] 0.2408066
DP_black_norm
[1] 0.1356127

Czech “okrová”

okrovy_obs <- c(0,4,2,1)

okrovy_obs
[1] 0 4 2 1
freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop
[1] 0.25 0.25 0.25 0.25
okrovy_obs <- prop.table(okrovy_obs)

okrovy_obs 
[1] 0.0000000 0.5714286 0.2857143 0.1428571
DP_okrovy <- sum(abs(okrovy_obs - exp_cz_prop))/2

DP_okrovy
[1] 0.3571429
DP_okrovy_norm <- DP_okrovy/(1 - min(exp_cz_prop))

DP_okrovy_norm
[1] 0.4761905
DP_black_norm
[1] 0.1356127
library(ggplot2)

labs <- c("academic","fiction","press","spoken")

label <- rep(c("okrovy","cerny"), each =4)

barvy <- c(okrovy_obs,cerny_obs)

df <- cbind(labs,barvy,label)
df <- as.data.frame(df) 

ggplot(data=df, aes(x=labs, y=barvy, fill=label)) +
geom_bar(stat="identity", position=position_dodge())

Czech “oranžový”

oranzovy_obs <- c(23,22,16,25)

oranzovy_obs
[1] 23 22 16 25
freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop
[1] 0.25 0.25 0.25 0.25
oranzovy_obs <- prop.table(oranzovy_obs)

oranzovy_obs 
[1] 0.2674419 0.2558140 0.1860465 0.2906977
DP_oranzovy <- sum(abs(oranzovy_obs - exp_cz_prop))/2

DP_oranzovy
[1] 0.06395349
DP_oranzovy_norm <- DP_oranzovy/(1 - min(exp_cz_prop))

DP_oranzovy_norm
[1] 0.08527132
DP_black_norm
[1] 0.1356127
library(ggplot2)

labs <- c("academic","fiction","press","spoken")

label <- rep(c("oranzovy","cerny"), each =4)

barvy <- c(oranzovy_obs,cerny_obs)

df <- cbind(labs,barvy,label)
df <- as.data.frame(df) 

ggplot(data=df, aes(x=labs, y=barvy, fill=label)) +
geom_bar(stat="identity", position=position_dodge())

back to English

primcol <- colreg[c(1, 2, 5, 9:11),]

primcol
       spoken fiction academic press
black   20335   41118    26892 73080
blue     4693   22093     3605 21210
green    3860   14398     4477 26837
red      7230   25111     5621 34596
white   14474   40745    26336 54883
yellow   1349   10553     1855 10382
seccol <- colreg[-c(1, 2, 5, 9:11),]

seccol
       spoken fiction academic press
brown    1185   10914     1201 11539
gray     1168   12140     1289  6559
orange    931    3496      474  5766
pink      962    7312      584  6356
purple    613    3366      429  3403
primcol_sums <- colSums(primcol)

primcol_sums
  spoken  fiction academic    press 
   51941   154018    68786   220988 
seccol_sums <- colSums(seccol)

seccol_sums
  spoken  fiction academic    press 
    4859    37228     3977    33623 
primcol_obs <- prop.table(primcol_sums)

primcol_obs
   spoken   fiction  academic     press 
0.1047762 0.3106874 0.1387561 0.4457803 
seccol_obs <- prop.table(seccol_sums)

seccol_obs
    spoken    fiction   academic      press 
0.06097607 0.46717783 0.04990776 0.42193833 
DP_primcol <- sum(abs(primcol_obs - exp_prop))/2

DP_primcol
[1] 0.1582399
DP_seccol <- sum(abs(seccol_obs - exp_prop))/2

DP_seccol
[1] 0.2908884
DP_primcol_norm <- DP_primcol/(1 - min(exp_prop))

DP_primcol_norm
[1] 0.1964978
DP_seccol_norm <- DP_seccol/(1 - min(exp_prop))

DP_seccol_norm
[1] 0.3612168

Trying some standard tools

  • first idea: chi-square test, baseline from freqreq

  • let’s check but in simple cases chisq.test should work:

x <- c(1000, 2000, 2000, 1000)
y <- c(99, 201, 201, 99)

table_test <- matrix(c(x, y), nrow = 2, byrow = TRUE)

print(table_test)
     [,1] [,2] [,3] [,4]
[1,] 1000 2000 2000 1000
[2,]   99  201  201   99
chisq.test(table_test)

    Pearson's Chi-squared test

data:  table_test
X-squared = 0.027285, df = 3, p-value = 0.9988
# more varied y

y <- c(10, 10, 201, 50)
table_test <- matrix(c(x, y), nrow = 2, byrow = TRUE)

print(table_test)
     [,1] [,2] [,3] [,4]
[1,] 1000 2000 2000 1000
[2,]   10   10  201   50
chisq.test(table_test)

    Pearson's Chi-squared test

data:  table_test
X-squared = 221.88, df = 3, p-value < 2.2e-16
  • now to data
table_test_prim <- matrix(c(freqreg, primcol_sums), nrow = 2, byrow = TRUE)

print(table_test_prim)
         [,1]     [,2]     [,3]      [,4]
[1,] 95385672 90344134 91044778 187245672
[2,]    51941   154018    68786    220988
chisq.test(table_test_prim)

    Pearson's Chi-squared test

data:  table_test_prim
X-squared = 69204, df = 3, p-value < 2.2e-16
# fisher's test doesn't work

table_test_sec <- matrix(c(freqreg, seccol_sums), nrow = 2, byrow = TRUE)

print(table_test_sec)
         [,1]     [,2]     [,3]      [,4]
[1,] 95385672 90344134 91044778 187245672
[2,]     4859    37228     3977     33623
chisq.test(table_test_sec)

    Pearson's Chi-squared test

data:  table_test_sec
X-squared = 47238, df = 3, p-value < 2.2e-16
# that's why they probably don't use standard tests
  • another possibility: proportions
  • slightly better at first sight but there’s a problem: chisq.test should be used for count data, not fractions
  • moreover the result is very tiny variation
table_test_prim <- matrix(c(prop.table(freqreg), prop.table(primcol_sums)), nrow = 2, byrow = TRUE)

print(table_test_prim)
          [,1]      [,2]      [,3]      [,4]
[1,] 0.2055636 0.1946987 0.1962086 0.4035291
[2,] 0.1047762 0.3106874 0.1387561 0.4457803
chisq.test(table_test_prim)
Warning in chisq.test(table_test_prim): Chi-squared approximation may be
incorrect

    Pearson's Chi-squared test

data:  table_test_prim
X-squared = 0.071308, df = 3, p-value = 0.995
table_test_sec <- matrix(c(prop.table(freqreg), prop.table(seccol_sums)), nrow = 2, byrow = TRUE)

print(table_test_sec)
           [,1]      [,2]       [,3]      [,4]
[1,] 0.20556360 0.1946987 0.19620863 0.4035291
[2,] 0.06097607 0.4671778 0.04990776 0.4219383
chisq.test(table_test_sec)
Warning in chisq.test(table_test_sec): Chi-squared approximation may be
incorrect

    Pearson's Chi-squared test

data:  table_test_sec
X-squared = 0.27798, df = 3, p-value = 0.9641

Domácí úloha

  • kombinace primárních a sekundárních barev
primarni_barvy <- c("černá", "modrá", "zelená", "červená", "bílá", "žlutá")
sekundární_barvy <- c("hnědá", "šedá", "oranžová", "růžová", "fialová")

# let's make all possible pairs of primary and secondary colors

kombinace <- expand.grid(primarni_barvy, sekundární_barvy)
print(kombinace)
      Var1     Var2
1    černá    hnědá
2    modrá    hnědá
3   zelená    hnědá
4  červená    hnědá
5     bílá    hnědá
6    žlutá    hnědá
7    černá     šedá
8    modrá     šedá
9   zelená     šedá
10 červená     šedá
11    bílá     šedá
12   žlutá     šedá
13   černá oranžová
14   modrá oranžová
15  zelená oranžová
16 červená oranžová
17    bílá oranžová
18   žlutá oranžová
19   černá   růžová
20   modrá   růžová
21  zelená   růžová
22 červená   růžová
23    bílá   růžová
24   žlutá   růžová
25   černá  fialová
26   modrá  fialová
27  zelená  fialová
28 červená  fialová
29    bílá  fialová
30   žlutá  fialová
# add name of student as third row and save it as csv

kombinace <- cbind(kombinace, "name" = "name")

write.csv(kombinace, "zadani_barev.csv")

References

Berlin, Brent, and Paul Kay. 1991. Basic Color Terms: Their Universality and Evolution. Univ of California Press.