{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandas for data analysis\n", "\n", "*\"Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.\"*\n", "\n", "It is a popular library for data analysis of \"big data\". Pandas offer an easy-to-use interface to quickly load and manipulate data in some of the most common formats (see http://pandas.pydata.org/pandas-docs/version/0.20/io.html for details).\n", "\n", "For tips and tricks on how to use Pandas for analysis of really big data (> 100MB, up to several GBs) check https://www.dataquest.io/blog/pandas-big-data/ . This page also offers information on internals of the Pandas library.\n", "\n", "More information about libraries:\n", "Pandas http://pandas.pydata.org/\n", "Numpy http://www.numpy.org/\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#IMPORTS - if you don't run Anaconda, you will need to install Pandas and Numpy module first (use PyPi)\n", "# pip install pandas\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from datetime import datetime\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset used from http://www.mapakriminality.cz/data contains information on criminal activities in CR between years 2013-2017" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Variables\n", "ROW_NAMES = [\"Index Kriminality\",\"Zjištěno\",\"Objasněno- Počet\",\"Objasněno- Dodatečně\", \\\n", " \"Stíháno, Vyšetřováno Osob - Mladiství 15-17 Let\",\"Stíháno, Vyšetřováno Osob - Recidivisté\", \\\n", " \"Stíháno, Vyšetřováno Osob - Celkem\",\"Stíháno, Vyšetřováno Osob - Nezletilí 1-14 Let\", \\\n", " \"Stíháno, Vyšetřováno Osob - Ženy\",\"Spácháno Skutků - Mladiství 15-17 Let\", \\\n", " \"Spácháno Skutků - Z Toho Alkohol\",\"Spácháno Skutků - Pod Vlivem\",\\\n", " \"Spácháno Skutků - Recidivisté\",\"Spácháno Skutků - Nezletilí 1-14 Let\",\\\n", " \"Škody V Tis. Kč - Zajištěno\",\"Škody V Tis. Kč - Celkem\",\"Časová Jednotka\",\\\n", " \"Kód úz.jednotky\",\"Název úz.jednotky\",\"Počet Obyv. úz. Jednotky\"]\n", "USED_ROW_NAME = 'Zjištěno'\n", "TIMESTAMP_NAME = \"Časová Jednotka\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Read CSV file\n", "df = pd.read_csv('data//crimes-0100-101-903-120-167.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Show loaded data, use df.head(n = 5) for larger datasets\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Show names of columns\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Numpy arrays work the same way as Python lists\n", "\n", "# Only show data in column we want to work with (head is to show only the beginning of the array, often used for performance)\n", "df[USED_ROW_NAME].head()\n", "\n", "# Show specific value\n", "df[USED_ROW_NAME][0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Show types of data we work with\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Lambda functions in python is limited to a single command (nevertheless this command can be very complex)\n", "# - you can pass multiple input parameters into the lambda function\n", "\n", "##f = lambda x,y : x+y\n", "##f(1,2)\n", "\n", "# - you can call if/else structure, but \"else' branch needs to be present\n", "\n", "##f = lambda x : x * x if x > 0 else \"IT'S A TRAP!\"\n", "##f(5)\n", "\n", "# - you CAN'T use for circle\n", "##f = lambda x,y : x*x for y in range(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Change all 0 values to NaN\n", "df = df.applymap(lambda x: np.nan if x == 0 else x)\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Show information about our dataset\n", "df[USED_ROW_NAME].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Change date format so that year is first (note that data is already sorted, if it wasn't use method groupby)\n", "df[TIMESTAMP_NAME] = df[TIMESTAMP_NAME].map(lambda x: datetime.strptime(x, '%m-%Y').date())\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Index for data frame is set to be the datetime\n", "df.set_index(pd.DatetimeIndex(df[TIMESTAMP_NAME]), inplace=True)\n", "\n", "# Sum up all data for USED_ROW_NAME with same month\n", "def sum_up_months(df):\n", " column = []\n", " \n", " for current_month in range(1, 13):\n", " # Choose all lines with the same month\n", " all_lines = df[df.index.month == current_month]\n", " # Sum values in all lines picked\n", " column.append(all_lines[USED_ROW_NAME].sum())\n", " \n", " return column\n", "\n", "# Create new data frame, that contains aggregated data\n", "cols = sum_up_months(df)\n", "df_ = pd.DataFrame(index=range(12), columns=[USED_ROW_NAME])\n", "df_[USED_ROW_NAME] = cols\n", "\n", "# Check if we got all months (indexing starts with 0)\n", "df_.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Draw the graph in matplotlib\n", "%matplotlib inline \n", "x_axis = list(range(len(df[USED_ROW_NAME])))\n", "x_ticks = [str(x) for x in df[TIMESTAMP_NAME]]\n", "\n", "#Set size for inline plot\n", "plt.rcParams['figure.figsize'] = (20.0, 10.0)\n", "\n", "#Set X axis names and draw graph\n", "plt.xticks(x_axis, x_ticks)\n", "plt.bar(x_axis, df[USED_ROW_NAME], color=\"orange\", width=0.5)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Only show first 12 months\n", "plt.xticks(x_axis[:12], x_ticks[:12])\n", "plt.bar(x_axis[:12], df[USED_ROW_NAME][:12], color=\"orange\", width=0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Agregated data\n", "\n", "x_axis = list(range(len(df_[USED_ROW_NAME])))\n", "x_ticks = [str(x+1) for x in df_.index]\n", "\n", "plt.xticks(x_axis, x_ticks)\n", "plt.bar(x_axis, df_[USED_ROW_NAME], color=\"orange\", width=0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Save times and used column data to CSV file\n", "df[[TIMESTAMP_NAME,USED_ROW_NAME]].to_csv(\"cleaned_data.csv\", header=True)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }