7 роки тому · 0d9a7cf773
--- a/02_Data_structures.ipynb
+++ b/02_Data_structures.ipynb
@@ -0,0 +1,568 @@
 
																+{
															
 
																+ "cells": [
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "# Tutorial: Part 2 - Data structures"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Python lists"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "python_list = [2, 3, 5, 7, 11, 13, 17, 19]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Access by index\n",
															
 
																+    "python_list[3]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Loop\n",
															
 
																+    "for number in python_list:\n",
															
 
																+    "    print(number)"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# List operations\n",
															
 
																+    "sum(python_list)"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Python dict"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "python_dict = {'two': 2,\n",
															
 
																+    "               'three': 3,\n",
															
 
																+    "               'five': 5,\n",
															
 
																+    "               'seven': 7,\n",
															
 
																+    "               'eleven': 11,\n",
															
 
																+    "               'thirteen': 13,\n",
															
 
																+    "               'seventeen': 17,\n",
															
 
																+    "               'nineteen': 19}"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Access by label\n",
															
 
																+    "python_dict['eleven']"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Access by position is not supported: 3 is not a key, so this raises KeyError\n",
															
 
																+    "python_dict[3]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Loop\n",
															
 
																+    "for name, number in python_dict.items():\n",
															
 
																+    "    print(f'{name}: {number}')"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### pandas Series"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "import pandas\n",
															
 
																+    "\n",
															
 
																+    "pandas_series = pandas.Series(python_dict, name='primes')\n",
															
 
																+    "pandas_series"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Access by label\n",
															
 
																+    "pandas_series['eleven']"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Access by position\n",
															
 
																+    "pandas_series.iloc[3]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Looping over the elements is usually bad practice (for performance reasons)\n",
															
 
																+    "pandas_series.apply(print);"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Series operations\n",
															
 
																+    "pandas_series.sum()"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Press <TAB> after the dot to see the list\n",
															
 
																+    "pandas_series."
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Read the documentation\n",
															
 
																+    "pandas_series?"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Most operations can be applied to the Series (all elements at a time)\n",
															
 
																+    "pandas_series + 1"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "pandas_series < 10"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Slicing works as in Python lists\n",
															
 
																+    "pandas_series[2:-2]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Selection can be performed with a boolean mask\n",
															
 
																+    "pandas_series[[True, False, True, False, True, False, True, False]]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# This is mainly useful with conditions\n",
															
 
																+    "pandas_series[pandas_series < 10]"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Operating with different Series uses labels to align\n",
															
 
																+    "another_series = pandas.Series({'eleven': 1, 'three': 0})\n",
															
 
																+    "pandas_series * another_series"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Data types"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# All the elements of the Series have the same type (the internal data representation)\n",
															
 
																+    "pandas_series.dtype"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# pandas uses numpy internally, and the main types are numpy types:\n",
															
 
																+    "import numpy\n",
															
 
																+    "numpy.bool_, numpy.uint8, numpy.int64, numpy.float64"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# For memory and speed, smaller types (e.g. 8 bits over 64 bits) are preferred, but they need to be big enough for our data\n",
															
 
																+    "pandas.Series([0, 255], dtype=numpy.uint8) + 1"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "pandas.Series([0, 255], dtype=numpy.uint8) - 1"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# By default, pandas uses the largest types, to avoid problems\n",
															
 
																+    "pandas.Series([0, 255]).dtype"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# pandas automatically changes the type (upcasts) when mixing data\n",
															
 
																+    "int_series = pandas.Series([1, 5, 10], dtype=numpy.uint64)\n",
															
 
																+    "int_series.iloc[1] = 5.5\n",
															
 
																+    "int_series.dtype"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "The main types:\n",
															
 
																+    "- object\n",
															
 
																+    "  - float\n",
															
 
																+    "    - int\n",
															
 
																+    "      - bool\n",
															
 
																+    "  - datetime\n",
															
 
																+    "  - category\n",
															
 
																+    "\n",
															
 
																+    "`object` is a Python object and can be anything (for example, strings are objects)\n",
															
 
																+    "\n",
															
 
																+    "When mixing data of different types, the most specific type that can represent both is used. For example:\n",
															
 
																+    "- bool + int -> int\n",
															
 
																+    "- int + float -> float\n",
															
 
																+    "- float + datetime -> object"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "bool_series = pandas.Series([True])\n",
															
 
																+    "int_series = pandas.Series([100])\n",
															
 
																+    "float_series = pandas.Series([3.141592])\n",
															
 
																+    "datetime_series = pandas.Series(pandas.Timestamp('2018-01-01'))"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "pandas.concat([bool_series, int_series])"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "pandas.concat([int_series, float_series])"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "pandas.concat([float_series, datetime_series])"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# object is the most flexible, but is very slow\n",
															
 
																+    "large_series = pandas.Series(numpy.random.randint(0, 255, 100_000_000))\n",
															
 
																+    "large_series_as_object = large_series.astype(object)"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "%timeit large_series.sum()"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "%timeit large_series_as_object.sum()"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### pandas DataFrame"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "A `DataFrame` can be seen as a collection of `Series` that share the index."
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "legs = pandas.Series({'dog': 4, 'spider': 8, 'swallow': 2})\n",
															
 
																+    "species = pandas.Series({'dog': 'mammal', 'spider': 'insect', 'swallow': 'bird'})\n",
															
 
																+    "speed = pandas.Series({'dog': 8.49, 'spider': 0.53, 'swallow': 11.0})\n",
															
 
																+    "\n",
															
 
																+    "animals = pandas.DataFrame({'legs': legs, 'species': species, 'speed': speed})\n",
															
 
																+    "animals"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Every column of a DataFrame has a type\n",
															
 
																+    "animals.dtypes"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# If we select a column, it is returned as a regular Series\n",
															
 
																+    "animals['speed']"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Same applies for a row, but types are generally upcasted\n",
															
 
																+    "animals.loc['swallow']"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# DataFrames can be transposed\n",
															
 
																+    "animals.T"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# But again, types are generally upcasted, as the types are always specified column-wise\n",
															
 
																+    "animals.T.dtypes"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# If we operate a DataFrame with a scalar, the operation is element-wise\n",
															
 
																+    "animals * 2"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "animals"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Operations between DataFrame and Series are sometimes possible\n",
															
 
																+    "animals == pandas.Series([2, 'insect', 3.141592], index=['legs', 'species', 'speed'])"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Operations also exist as methods, so the axis can be changed\n",
															
 
																+    "animals.eq(pandas.Series([4, 'insect', 2], index=['dog', 'spider', 'swallow']),\n",
															
 
																+    "           axis='rows')"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "# Operations with two DataFrames are usually possible, and they operate element-wise based on both indices\n",
															
 
																+    "animals_bad_copy = animals.copy()\n",
															
 
																+    "animals_bad_copy.loc['dog', 'legs'] = 3\n",
															
 
																+    "animals_bad_copy.loc['spider', 'speed'] = 300.\n",
															
 
																+    "\n",
															
 
																+    "animals == animals_bad_copy"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  }
															
 
																+ ],
															
 
																+ "metadata": {
															
 
																+  "kernelspec": {
															
 
																+   "display_name": "Python 3",
															
 
																+   "language": "python",
															
 
																+   "name": "python3"
															
 
																+  },
															
 
																+  "language_info": {
															
 
																+   "codemirror_mode": {
															
 
																+    "name": "ipython",
															
 
																+    "version": 3
															
 
																+   },
															
 
																+   "file_extension": ".py",
															
 
																+   "mimetype": "text/x-python",
															
 
																+   "name": "python",
															
 
																+   "nbconvert_exporter": "python",
															
 
																+   "pygments_lexer": "ipython3",
															
 
																+   "version": "3.7.0"
															
 
																+  }
															
 
																+ },
															
 
																+ "nbformat": 4,
															
 
																+ "nbformat_minor": 2
															
 
																+}