Barvy

Author

Published

March 19, 2024

# install instructions for RLing
#
# - [ ] install devtools
# - [ ] gunzip Rling_1.0.tar.gz
# - [ ] devtools::install_local("Rling_1.0.tar")
# - [ ] also works: install.packages(pkgs = file.choose(), repos = NULL, type = "source")

library(Rling)
#library(languageR)

# English 'black'

data(colreg)

colreg

       spoken fiction academic press
black   20335   41118    26892 73080
blue     4693   22093     3605 21210
brown    1185   10914     1201 11539
gray     1168   12140     1289  6559
green    3860   14398     4477 26837
orange    931    3496      474  5766
pink      962    7312      584  6356
purple    613    3366      429  3403
red      7230   25111     5621 34596
white   14474   40745    26336 54883
yellow   1349   10553     1855 10382

Stará debata

univerzální hierarchie barev: Berlin and Kay (1991)
Universality And Evolution of Basic Color Terms
WALS

Derivační morfologie?

Levshina: základní nápad – Deviation of Proportions, DP (Gries 2008; Lijijt & Gries 2012)
více lingvistický přístup?
intuice:

. . Na obzoru se černaly mraky. Na obzoru se ??oranžovělo slunce.

. . Petr nabílil košili. Petr ??naoranžověl košili.

. . Petr začernil text. Petr ??zaokroval text.

DeriNET search: [pos=“A” lemma=“bílý”] [pos=“V”]
vs. [pos=“A” lemma=“okrový”] [pos=“V”]
DeriNET – v1

Korpusy

ČNK-search
COCA-search
back to Levshina:
freqreq: sizes of 4 COCA registers

freqreg <- c(95385672, 90344134, 91044778, 187245672)

freqreg

[1]  95385672  90344134  91044778 187245672

exp_prop <- prop.table(freqreg)

exp_prop

[1] 0.2055636 0.1946987 0.1962086 0.4035291

mosaicplot(freqreg)

colreg[1,]

      spoken fiction academic press
black  20335   41118    26892 73080

black_obs <- prop.table(colreg[1,])

black_obs

         spoken   fiction  academic    press
black 0.1259718 0.2547189 0.1665913 0.452718

mosaicplot(black_obs)

DP_black <- sum(abs(black_obs - exp_prop))/2

DP_black

[1] 0.1092091

min(exp_prop)

[1] 0.1946987

DP_black_norm <- DP_black/(1 - min(exp_prop))

DP_black_norm

[1] 0.1356127

English “gray”

gray_obs <- prop.table(colreg[4,])

gray_obs

         spoken   fiction   academic     press
gray 0.05520892 0.5738325 0.06092834 0.3100303

DP_gray <- sum(abs(gray_obs - exp_prop))/2

DP_gray

[1] 0.3791338

DP_gray_norm <- DP_gray/(1 - min(exp_prop))

DP_gray_norm

[1] 0.4707974

Czech “černý”

cerny_obs <- c(363,181,175,124)

cerny_obs

[1] 363 181 175 124

freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop

[1] 0.25 0.25 0.25 0.25

cerny_obs <- prop.table(cerny_obs)

cerny_obs

[1] 0.4306050 0.2147094 0.2075919 0.1470937

DP_cerny <- sum(abs(cerny_obs - exp_cz_prop))/2

DP_cerny

[1] 0.180605

DP_cerny_norm <- DP_cerny/(1 - min(exp_cz_prop))

DP_cerny_norm

[1] 0.2408066

DP_black_norm

[1] 0.1356127

Czech “okrová”

okrovy_obs <- c(0,4,2,1)

okrovy_obs

[1] 0 4 2 1

freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop

[1] 0.25 0.25 0.25 0.25

okrovy_obs <- prop.table(okrovy_obs)

okrovy_obs

[1] 0.0000000 0.5714286 0.2857143 0.1428571

DP_okrovy <- sum(abs(okrovy_obs - exp_cz_prop))/2

DP_okrovy

[1] 0.3571429

DP_okrovy_norm <- DP_okrovy/(1 - min(exp_cz_prop))

DP_okrovy_norm

[1] 0.4761905

DP_black_norm

[1] 0.1356127

library(ggplot2)

labs <- c("academic","fiction","press","spoken")

label <- rep(c("okrovy","cerny"), each =4)

barvy <- c(okrovy_obs,cerny_obs)

df <- cbind(labs,barvy,label)
df <- as.data.frame(df) 

ggplot(data=df, aes(x=labs, y=barvy, fill=label)) +
geom_bar(stat="identity", position=position_dodge())

Czech “oranžový”

oranzovy_obs <- c(23,22,16,25)

oranzovy_obs

[1] 23 22 16 25

freq_cz_reg <- c(1000000,1000000,1000000,1000000)

exp_cz_prop <- prop.table(freq_cz_reg)

exp_cz_prop

[1] 0.25 0.25 0.25 0.25

oranzovy_obs <- prop.table(oranzovy_obs)

oranzovy_obs

[1] 0.2674419 0.2558140 0.1860465 0.2906977

DP_oranzovy <- sum(abs(oranzovy_obs - exp_cz_prop))/2

DP_oranzovy

[1] 0.06395349

DP_oranzovy_norm <- DP_oranzovy/(1 - min(exp_cz_prop))

DP_oranzovy_norm

[1] 0.08527132

DP_black_norm

[1] 0.1356127

library(ggplot2)

labs <- c("academic","fiction","press","spoken")

label <- rep(c("oranzovy","cerny"), each =4)

barvy <- c(oranzovy_obs,cerny_obs)

df <- cbind(labs,barvy,label)
df <- as.data.frame(df) 

ggplot(data=df, aes(x=labs, y=barvy, fill=label)) +
geom_bar(stat="identity", position=position_dodge())

back to English

primcol <- colreg[c(1, 2, 5, 9:11),]

primcol

       spoken fiction academic press
black   20335   41118    26892 73080
blue     4693   22093     3605 21210
green    3860   14398     4477 26837
red      7230   25111     5621 34596
white   14474   40745    26336 54883
yellow   1349   10553     1855 10382

seccol <- colreg[-c(1, 2, 5, 9:11),]

seccol

       spoken fiction academic press
brown    1185   10914     1201 11539
gray     1168   12140     1289  6559
orange    931    3496      474  5766
pink      962    7312      584  6356
purple    613    3366      429  3403

primcol_sums <- colSums(primcol)

primcol_sums

  spoken  fiction academic    press 
   51941   154018    68786   220988

seccol_sums <- colSums(seccol)

seccol_sums

  spoken  fiction academic    press 
    4859    37228     3977    33623

primcol_obs <- prop.table(primcol_sums)

primcol_obs

   spoken   fiction  academic     press 
0.1047762 0.3106874 0.1387561 0.4457803

seccol_obs <- prop.table(seccol_sums)

seccol_obs

    spoken    fiction   academic      press 
0.06097607 0.46717783 0.04990776 0.42193833

DP_primcol <- sum(abs(primcol_obs - exp_prop))/2

DP_primcol

[1] 0.1582399

DP_seccol <- sum(abs(seccol_obs - exp_prop))/2

DP_seccol

[1] 0.2908884

DP_primcol_norm <- DP_primcol/(1 - min(exp_prop))

DP_primcol_norm

[1] 0.1964978

DP_seccol_norm <- DP_seccol/(1 - min(exp_prop))

DP_seccol_norm

[1] 0.3612168

Trying some standard tools

first idea: chi-square test, baseline from freqreq
let’s check but in simple cases chisq.test should work:

x <- c(1000, 2000, 2000, 1000)
y <- c(99, 201, 201, 99)

table_test <- matrix(c(x, y), nrow = 2, byrow = TRUE)

print(table_test)

     [,1] [,2] [,3] [,4]
[1,] 1000 2000 2000 1000
[2,]   99  201  201   99

chisq.test(table_test)


    Pearson's Chi-squared test

data:  table_test
X-squared = 0.027285, df = 3, p-value = 0.9988

# more varied y

y <- c(10, 10, 201, 50)
table_test <- matrix(c(x, y), nrow = 2, byrow = TRUE)

print(table_test)

     [,1] [,2] [,3] [,4]
[1,] 1000 2000 2000 1000
[2,]   10   10  201   50

chisq.test(table_test)


    Pearson's Chi-squared test

data:  table_test
X-squared = 221.88, df = 3, p-value < 2.2e-16

now to data

table_test_prim <- matrix(c(freqreg, primcol_sums), nrow = 2, byrow = TRUE)

print(table_test_prim)

         [,1]     [,2]     [,3]      [,4]
[1,] 95385672 90344134 91044778 187245672
[2,]    51941   154018    68786    220988

chisq.test(table_test_prim)


    Pearson's Chi-squared test

data:  table_test_prim
X-squared = 69204, df = 3, p-value < 2.2e-16

# fisher's test doesn't work

table_test_sec <- matrix(c(freqreg, seccol_sums), nrow = 2, byrow = TRUE)

print(table_test_sec)

         [,1]     [,2]     [,3]      [,4]
[1,] 95385672 90344134 91044778 187245672
[2,]     4859    37228     3977     33623

chisq.test(table_test_sec)


    Pearson's Chi-squared test

data:  table_test_sec
X-squared = 47238, df = 3, p-value < 2.2e-16

# that's why they probably don't use standard tests

another possibility: proportions
slightly better at first sight but there’s a problem: chisq.test should be used for count data, not fractions
moreover the result is very tiny variation

table_test_prim <- matrix(c(prop.table(freqreg), prop.table(primcol_sums)), nrow = 2, byrow = TRUE)

print(table_test_prim)

          [,1]      [,2]      [,3]      [,4]
[1,] 0.2055636 0.1946987 0.1962086 0.4035291
[2,] 0.1047762 0.3106874 0.1387561 0.4457803

chisq.test(table_test_prim)

Warning in chisq.test(table_test_prim): Chi-squared approximation may be
incorrect


    Pearson's Chi-squared test

data:  table_test_prim
X-squared = 0.071308, df = 3, p-value = 0.995

table_test_sec <- matrix(c(prop.table(freqreg), prop.table(seccol_sums)), nrow = 2, byrow = TRUE)

print(table_test_sec)

           [,1]      [,2]       [,3]      [,4]
[1,] 0.20556360 0.1946987 0.19620863 0.4035291
[2,] 0.06097607 0.4671778 0.04990776 0.4219383

chisq.test(table_test_sec)

Warning in chisq.test(table_test_sec): Chi-squared approximation may be
incorrect


    Pearson's Chi-squared test

data:  table_test_sec
X-squared = 0.27798, df = 3, p-value = 0.9641

Domácí úloha

kombinace primárních a sekundárních barev

primarni_barvy <- c("černá", "modrá", "zelená", "červená", "bílá", "žlutá")
sekundární_barvy <- c("hnědá", "šedá", "oranžová", "růžová", "fialová")

# let's make all possible pairs of primary and secondary colors

kombinace <- expand.grid(primarni_barvy, sekundární_barvy)
print(kombinace)

      Var1     Var2
1    černá    hnědá
2    modrá    hnědá
3   zelená    hnědá
4  červená    hnědá
5     bílá    hnědá
6    žlutá    hnědá
7    černá     šedá
8    modrá     šedá
9   zelená     šedá
10 červená     šedá
11    bílá     šedá
12   žlutá     šedá
13   černá oranžová
14   modrá oranžová
15  zelená oranžová
16 červená oranžová
17    bílá oranžová
18   žlutá oranžová
19   černá   růžová
20   modrá   růžová
21  zelená   růžová
22 červená   růžová
23    bílá   růžová
24   žlutá   růžová
25   černá  fialová
26   modrá  fialová
27  zelená  fialová
28 červená  fialová
29    bílá  fialová
30   žlutá  fialová

# add name of student as third row and save it as csv

kombinace <- cbind(kombinace, "name" = "name")

write.csv(kombinace, "zadani_barev.csv")

References

Berlin, Brent, and Paul Kay. 1991. Basic Color Terms: Their Universality and Evolution. Univ of California Press.