Prechádzať zdrojové kódy

Adding tutorial on data structures

Marc Garcia 7 rokov pred
rodič
commit
0d9a7cf773
1 zmenil súbory, kde vykonal 568 pridanie a 0 odobranie
  1. 568 0
      02_Data_structures.ipynb

+ 568 - 0
02_Data_structures.ipynb

@@ -0,0 +1,568 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tutorial: Part 2 - Data structures"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Python lists"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "python_list = [2, 3, 5, 7, 11, 13, 17, 19]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access by index\n",
+    "python_list[3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loop\n",
+    "for number in python_list:\n",
+    "    print(number)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List operations\n",
+    "sum(python_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Python dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "python_dict = {'two': 2,\n",
+    "               'three': 3,\n",
+    "               'five': 5,\n",
+    "               'seven': 7,\n",
+    "               'eleven': 11,\n",
+    "               'thirteen': 13,\n",
+    "               'seventeen': 17,\n",
+    "               'nineteen': 19}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access by label\n",
+    "python_dict['eleven']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access by position is not possible: dicts are looked up by key, so this raises a KeyError\n",
+    "python_dict[3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loop\n",
+    "for name, number in python_dict.items():\n",
+    "    print(f'{name}: {number}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### pandas Series"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas\n",
+    "\n",
+    "pandas_series = pandas.Series(python_dict, name='primes')\n",
+    "pandas_series"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access by label\n",
+    "pandas_series['eleven']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access by position\n",
+    "pandas_series[3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Looping over a Series is usually bad practice (for performance reasons)\n",
+    "pandas_series.apply(print);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Series operations\n",
+    "pandas_series.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Press <TAB> after the dot to see the list of available attributes and methods\n",
+    "pandas_series."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the documentation\n",
+    "pandas_series?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Most operations can be applied to the Series (all elements at a time)\n",
+    "pandas_series + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pandas_series < 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Slicing works as in Python lists\n",
+    "pandas_series[2:-2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Selection can be performed with a boolean mask\n",
+    "pandas_series[[True, False, True, False, True, False, True, False]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This is mainly useful with conditions\n",
+    "pandas_series[pandas_series < 10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Operations between different Series use the labels to align the values\n",
+    "another_series = pandas.Series({'eleven': 1, 'three': 0})\n",
+    "pandas_series * another_series"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All the elements of the Series have the same type (the internal data representation)\n",
+    "pandas_series.dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pandas uses numpy internally, and the main types are numpy types:\n",
+    "import numpy\n",
+    "numpy.bool, numpy.uint8, numpy.int64, numpy.float64"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For memory and speed, smaller types are preferred (e.g. 8 bits over 64 bits), but they need to be big enough for our data\n",
+    "pandas.Series([0, 255], dtype=numpy.uint8) + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pandas.Series([0, 255], dtype=numpy.uint8) - 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# By default, pandas uses the largest types, to avoid problems\n",
+    "pandas.Series([0, 255]).dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pandas automatically changes the type (upcasts) when mixing data\n",
+    "int_series = pandas.Series([1, 5, 10], dtype=numpy.uint64)\n",
+    "int_series.iloc[1] = 5.5\n",
+    "int_series.dtype"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The main types:\n",
+    "- object\n",
+    "  - float\n",
+    "    - int\n",
+    "      - bool\n",
+    "  - datetime\n",
+    "  - category\n",
+    "\n",
+    "`object` is a Python object and can be anything (for example, strings are objects)\n",
+    "\n",
+    "When mixing data of different types, the most specific type that can represent both is used. For example:\n",
+    "- bool + int -> int\n",
+    "- int + float -> float\n",
+    "- float + datetime -> object"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bool_series = pandas.Series([True])\n",
+    "int_series = pandas.Series([100])\n",
+    "float_series = pandas.Series([3.141592])\n",
+    "datetime_series = pandas.Series(pandas.Timestamp('2018-01-01'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pandas.concat([bool_series, int_series])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pandas.concat([int_series, float_series])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pandas.concat([float_series, datetime_series])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# object is the most flexible, but is very slow\n",
+    "large_series = pandas.Series(numpy.random.randint(0, 255, 100_000_000))\n",
+    "large_series_as_object = large_series.astype(object)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit large_series.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit large_series_as_object.sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### pandas DataFrame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A `DataFrame` can be seen as a collection of `Series` that share the index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "legs = pandas.Series({'dog': 4, 'spider': 8, 'swallow': 2})\n",
+    "species = pandas.Series({'dog': 'mammal', 'spider': 'insect', 'swallow': 'bird'})\n",
+    "speed = pandas.Series({'dog': 8.49, 'spider': 0.53, 'swallow': 11.0})\n",
+    "\n",
+    "animals = pandas.DataFrame({'legs': legs, 'species': species, 'speed': speed})\n",
+    "animals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Every column of a DataFrame has a type\n",
+    "animals.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If we select a column, it is returned as a regular Series\n",
+    "animals['speed']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The same applies to a row, but types are generally upcast\n",
+    "animals.loc['swallow']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DataFrames can be transposed\n",
+    "animals.T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# But again, types are generally upcast, as the types are always specified column-wise\n",
+    "animals.T.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If we operate on a DataFrame with a scalar, the operation is element-wise\n",
+    "animals * 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "animals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Operations between DataFrame and Series are sometimes possible\n",
+    "animals == pandas.Series([2, 'insect', 3.141592], index=['legs', 'species', 'speed'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Operations also exist as methods, so the axis can be changed\n",
+    "animals.eq(pandas.Series([4, 'insect', 2], index=['dog', 'spider', 'swallow']),\n",
+    "           axis='rows')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Operations with two DataFrames are usually possible, and they operate element-wise based on both indices\n",
+    "animals_bad_copy = animals.copy()\n",
+    "animals_bad_copy.loc['dog', 'legs'] = 3\n",
+    "animals_bad_copy.loc['spider', 'speed'] = 300.\n",
+    "\n",
+    "animals == animals_bad_copy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}