{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Analysis and Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "from nltk.tokenize import word_tokenize\n",
    "from collections import Counter\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the whole input text into memory.\n",
    "# The encoding is passed explicitly so the Czech diacritics decode the same way on\n",
    "# every platform instead of depending on the locale's default encoding.\n",
    "# NOTE(review): assumes maj.txt is stored as UTF-8 -- confirm.\n",
    "with open('../01-DH/maj.txt', encoding='utf-8') as f:  # modify the path if needed\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send the text to the tagger service and parse its JSON answer into `data`.\n",
    "# The request parameters get their own name so that `data` only ever refers to\n",
    "# the parsed response.\n",
    "# NOTE(review): ';' is replaced by ',' before sending -- presumably the service\n",
    "# treats ';' specially; confirm.\n",
    "payload = {'call': 'tagger',\n",
    "           'lang': 'cs',\n",
    "           'output': 'json',\n",
    "           'text': text.replace(';', ',')\n",
    "          }\n",
    "uri = 'https://nlp.fi.muni.cz/languageservices/service.py'\n",
    "# NOTE(review): params= puts the whole text into the URL query string; a long text\n",
    "# may exceed URL length limits -- consider data= if the service accepts form bodies.\n",
    "# The timeout keeps the cell from hanging forever when the service is unreachable.\n",
    "r = requests.post(uri, params=payload, timeout=60)\n",
    "# Fail here with the HTTP status instead of later with an opaque JSON/key error.\n",
    "r.raise_for_status()\n",
    "data = r.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the response instead of dumping all of it: every key is shown, but\n",
    "# list values are cut to their first 20 items so the output stays readable.\n",
    "{key: (value[:20] if isinstance(value, list) else value) for key, value in data.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize\n",
    "Unlike in the previous notebooks, we also record which sentence each token belongs to. This allows us to group the results by sentence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk through the tagger's vertical output: a one-item row starting with '<s'\n",
    "# opens a new sentence, a three-item row is a (word, lemma, tag) token.\n",
    "sentence = 0\n",
    "tokens = []\n",
    "for line in data['vertical']:\n",
    "    if len(line) == 1 and line[0].startswith('<s'):\n",
    "        sentence += 1\n",
    "    elif len(line) == 3:\n",
    "        # new list: the token row extended by the number of its sentence\n",
    "        tokens.append(line + [sentence])\n",
    "df = pd.DataFrame(tokens, columns=['word', 'lemma', 'tag', 'sentence'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The part-of-speech code is the first two characters of each tag.\n",
    "pos = df['tag'].str[:2].tolist()\n",
    "df['pos'] = pos\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look at the first token row and at the type pandas uses for a single row.\n",
    "df.iloc[0], type(df.iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type of a single column (accessed as an attribute) and the number of token rows.\n",
    "type(df.pos), len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract Nouns\n",
    "We extract only rows (using `loc`) with nouns. Afterwards, we use the shallow ontology **Sholva** to categorize the nouns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take an explicit copy of the noun rows (pos == 'k1') so that the columns added\n",
    "# here, and the values written into them later, modify `nouns` itself and not a\n",
    "# slice of `df` (this avoids pandas' SettingWithCopyWarning).\n",
    "nouns = df.loc[df.pos == 'k1'].copy()\n",
    "# One initially empty column per Sholva class, placed after the existing columns.\n",
    "for sholva_class in ['person-individual', 'event', 'person', 'substance']:\n",
    "    nouns[sholva_class] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the Sholva web service for the semantic classes of every noun lemma and\n",
    "# write the answers into the matching columns of `nouns`.\n",
    "sholva_uri = 'https://nlp.fi.muni.cz/languageservices/websholva/index.cgi'\n",
    "# lemma -> list of class records; each distinct lemma is requested only once\n",
    "sholva_cache = {}\n",
    "for i, noun in nouns.iterrows():\n",
    "    if noun.lemma not in sholva_cache:\n",
    "        # params= lets requests URL-encode the lemma (diacritics, '&', '#', ...)\n",
    "        r = requests.get(sholva_uri, params={'word': noun.lemma}, timeout=30)\n",
    "        r.raise_for_status()\n",
    "        response = r.json()\n",
    "        # a response without 'classes' is treated as 'no classes' instead of crashing\n",
    "        sholva_cache[noun.lemma] = response.get('classes') or []\n",
    "    for semantic_class in sholva_cache[noun.lemma]:\n",
    "        class_name = semantic_class.get('class')\n",
    "        # skip records without a class name; they would otherwise fill a column named ''\n",
    "        if class_name:\n",
    "            nouns.at[i, class_name] = semantic_class.get('belongs')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We added four more columns, according to **Sholva** class names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the noun rows together with the Sholva class columns.\n",
    "nouns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grouping and Frequencies\n",
    "We group the nouns by the sentence they belong to and create a new DataFrame that holds, for each sentence, the number of nouns in each semantic class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count, for every sentence, how many nouns are marked '+' in each class column.\n",
    "class_columns = ['event', 'person', 'substance', 'person-individual']\n",
    "semantic_groups = (\n",
    "    nouns[class_columns]\n",
    "    .eq('+')                  # True where the noun is marked '+' for the class\n",
    "    .astype('int64')          # 1/0, so that the sum below is an integer count\n",
    "    .groupby(nouns.sentence)  # one group per sentence number\n",
    "    .sum()\n",
    "    .reset_index()            # turn the sentence number back into a column\n",
    ")\n",
    "semantic_groups"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization\n",
    "We would like to see the frequencies of the semantic classes over sentences. We do not use `pyplot.hist` here, since we already calculated the frequencies. We therefore draw simple bars."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grouped bar chart: for every sentence, one bar per semantic class.\n",
    "sentences = np.array(semantic_groups.sentence)\n",
    "# (column of semantic_groups, bar colour), in drawing order\n",
    "series = [('person', 'b'), ('event', 'g'), ('substance', 'r'), ('person-individual', 'y')]\n",
    "bar_width = 0.2\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "for k, (column, colour) in enumerate(series):\n",
    "    # offsets -0.3, -0.1, 0.1, 0.3 centre the group of four bars on the sentence number\n",
    "    offset = (k - (len(series) - 1) / 2) * bar_width\n",
    "    ax.bar(sentences + offset, semantic_groups[column], width=bar_width,\n",
    "           color=colour, align='center', label=column)\n",
    "\n",
    "# labels and legend, so the figure can be read without looking at the code\n",
    "ax.set_xlabel('sentence')\n",
    "ax.set_ylabel('number of nouns')\n",
    "ax.set_title('Semantic classes of nouns per sentence')\n",
    "ax.legend()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nouns whose 'substance' column is '+' (presumably: Sholva says the noun belongs to that class).\n",
    "nouns.loc[nouns.substance=='+']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}