{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text Analysis and Visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import nltk\n", "nltk.download('punkt')\n", "from nltk.tokenize import word_tokenize\n", "from collections import Counter\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read the whole input text into memory.\n", "# The encoding is passed explicitly so the text decodes the same way on every platform\n", "# instead of depending on the OS default (assumes the file is stored as UTF-8 - adjust if not).\n", "text = None\n", "with open('../01-DH/maj.txt', encoding='utf-8') as f: # modify the path if needed\n", "    text = f.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import json" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Form fields for the language service: run the tagger on Czech text and return JSON.\n", "# Semicolons in the text are replaced by commas before sending.\n", "payload = {\"call\": \"tagger\",\n", "           \"lang\": \"cs\",\n", "           \"output\": \"json\",\n", "           \"text\": text.replace(';', ',')\n", "          }\n", "uri = \"https://nlp.fi.muni.cz/languageservices/service.py\"\n", "# Send the fields in the POST body (data=) rather than in the URL (params=):\n", "# a whole document in the query string can exceed the URL length a server accepts.\n", "r = requests.post(uri, data=payload)\n", "# Stop here with the HTTP error instead of failing later while decoding a non-JSON error page.\n", "r.raise_for_status()\n", "# Parsed JSON response; later cells read it through the name `data`.\n", "data = r.json()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenize\n", "Unlike in previous notebooks, we also add the information about individual sentences, which allows us to group the results." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens = []\n", "sentence = 0\n", "for line in data['vertical']:\n", " new_line = line[:]\n", " if len(line)==1 and line[0].startswith(\"