# Text Analysis and Visualization

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the whole source text into memory.
# The encoding is stated explicitly so that reading does not depend on the
# platform default (which is not UTF-8 on every system); the text is tagged as
# Czech below, so diacritics must survive.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('../01-DH/maj.txt', encoding='utf-8') as f:  # modify the path if needed
    text = f.read()

In [None]:
import requests
import json

In [None]:
# Send the text to the language-services tagger and keep the decoded JSON
# reply in `data` (later cells read data['vertical']).
# NOTE(review): ';' is replaced with ',' before sending; presumably the
# service treats ';' as a separator - confirm.
payload = {"call": "tagger",
           "lang": "cs",
           "output": "json",
           "text": text.replace(';', ',')
          }
uri = "https://nlp.fi.muni.cz/languageservices/service.py"
# NOTE(review): `params=` puts the whole text into the URL query string; a long
# text may exceed URL length limits - consider `data=` if the service accepts
# a form body.
# requests has no default timeout, so one is set to avoid hanging forever.
r = requests.post(uri, params=payload, timeout=60)
# Fail here with the HTTP status rather than with a confusing JSON decode error.
r.raise_for_status()
data = r.json()

In [None]:
# Display the decoded tagger response for inspection.
data

## Tokenize
Unlike in the previous notebooks, we also record which sentence each token belongs to. This allows us to group the results by sentence.

In [None]:
# Build one [word, lemma, tag, sentence] row per token of the tagger output.
# A single-item line starting with "<s" advances the sentence counter;
# three-item lines are the tokens themselves. Any other line is ignored.
tokens = []
sentence_id = 0
for entry in data['vertical']:
    size = len(entry)
    if size == 1 and entry[0].startswith("<s"):
        sentence_id += 1
    elif size == 3:
        tokens.append([*entry, sentence_id])

# Each row already has the four fields in column order, so the frame can be
# built directly from the rows.
df = pd.DataFrame(tokens, columns=["word", "lemma", "tag", "sentence"])
df

In [None]:
# The first two characters of every tag are stored as the part-of-speech code
# (the noun filter later in the notebook selects the code "k1").
pos = df["tag"].str.slice(0, 2).tolist()
df["pos"] = pos
df

In [None]:
# Peek at the first token row and at the type pandas uses for a single row.
first_row = df.iloc[0]
first_row, type(first_row)

In [None]:
# Check the type of a single column and the number of token rows.
type(df["pos"]), df.shape[0]

## Extract Nouns
We extract only the rows with nouns (using `loc`). Afterwards, we use the shallow ontology **Sholva** to categorize the nouns.

In [None]:
# Keep only the noun rows (part-of-speech code "k1").
# .copy() makes `nouns` an independent frame, so adding and filling columns
# below neither touches `df` nor triggers pandas' SettingWithCopy warnings.
nouns = df.loc[df.pos == "k1"].copy()

# Add one empty column per semantic class, appended after the existing columns
# in this order. Appending (instead of inserting at a fixed position) does not
# depend on how many columns `df` happens to have.
for class_column in ('person-individual', 'event', 'person', 'substance'):
    nouns[class_column] = ''

In [None]:
# Look up each noun lemma in the Sholva web service and store, per row, the
# 'belongs' value reported for every semantic class (later cells compare these
# values with '+').
sholva_uri = 'https://nlp.fi.muni.cz/languageservices/websholva/index.cgi'
# lemma -> list of class records; a lemma that occurs many times is queried once.
sholva_cache = {}
for i, noun in nouns.iterrows():
    lemma = noun.lemma
    if lemma not in sholva_cache:
        # Passing the lemma via `params` lets requests URL-encode it, so
        # characters such as '&' or '#' cannot corrupt the query string.
        r = requests.get(sholva_uri, params={'word': lemma}, timeout=30)
        r.raise_for_status()
        # A reply without 'classes' is treated as "no classes" instead of
        # crashing the loop on None.
        sholva_cache[lemma] = r.json().get('classes') or []
    for semantic_class in sholva_cache[lemma]:
        class_name = semantic_class.get('class')
        if not class_name:
            # Skip records without a class name rather than creating a column named ''.
            continue
        nouns.at[i, class_name] = semantic_class.get('belongs')

We added four more columns, according to **Sholva** class names.

In [None]:
# Display the noun table, including the semantic class columns.
nouns

## Grouping and Frequencies
We group the nouns by the sentence they belong to. We then create a new DataFrame with the total count of each semantic class for each sentence.

In [None]:
# For every sentence, count how many of its nouns are marked '+' in each
# semantic class column. The result has one row per sentence, with the
# sentence number in the first column.
class_columns = ["event", "person", "substance", "person-individual"]

# True where a noun is marked as belonging to the class, False otherwise.
is_member = nouns[class_columns].eq('+')

# Summing booleans per sentence yields the number of '+' marks.
semantic_groups = (
    is_member.groupby(nouns["sentence"])
    .sum()
    .reset_index()
)
semantic_groups

## Visualization
We would like to see the frequencies of the semantic classes over sentences. We do not use `pyplot.hist` here, since we already calculated the frequencies. We therefore draw simple bars.

In [None]:
# Draw, for every sentence, one bar per semantic class showing how many nouns
# of that class the sentence contains.
x = np.array(semantic_groups.sentence)
bar_width = 0.2
# (column in semantic_groups, bar colour) in left-to-right drawing order.
series = [
    ('person', 'b'),
    ('event', 'g'),
    ('substance', 'r'),
    ('person-individual', 'y'),
]

ax = plt.subplot(111)
for position, (column, color) in enumerate(series):
    # Spread the bars symmetrically so each group is centred on its sentence number.
    offset = (position - (len(series) - 1) / 2) * bar_width
    ax.bar(x + offset, semantic_groups[column], width=bar_width,
           color=color, align='center', label=column)

# Without labels and a legend the four colours cannot be told apart.
ax.set_xlabel('sentence')
ax.set_ylabel('number of nouns')
ax.legend(title='semantic class')

plt.show()

In [None]:
# Show only the nouns marked '+' in the substance column.
is_substance = nouns["substance"] == '+'
nouns[is_substance]