{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tutorial: Part 2 - Data structures" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Python lists" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "python_list = [2, 3, 5, 7, 11, 13, 17, 19]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by index\n", "python_list[3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by slice\n", "python_list[2:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Loop\n", "for number in python_list:\n", " print(number)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# List operations\n", "sum(python_list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Python dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "python_dict = {'two': 2,\n", " 'three': 3,\n", " 'five': 5,\n", " 'seven': 7,\n", " 'eleven': 11,\n", " 'thirteen': 13,\n", " 'seventeen': 17,\n", " 'nineteen': 19}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by label\n", "python_dict['eleven']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by position\n", "python_dict[3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Loop\n", "for name, number in python_dict.items():\n", " print(f'{name}: {number}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pandas Series" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas\n", "\n", "pandas_series = pandas.Series(python_dict, name='primes')\n", "pandas_series" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by label\n", "pandas_series['eleven']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Access by position (.iloc is explicit; plain [] is meant for labels)\n", "pandas_series.iloc[3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Looping is usually a bad practice (for performance reasons)\n", "pandas_series.apply(print);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Series operations\n", "pandas_series.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Press Tab after the dot to see the list of available attributes and methods\n", "pandas_series." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read the documentation\n", "pandas_series?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Most operations can be applied to the Series (all elements at a time)\n", "pandas_series + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_series < 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Slicing works as in Python lists\n", "pandas_series[2:-2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Selection can be performed with a boolean mask\n", "pandas_series[[True, False, True, False, True, False, True, False]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This is mainly useful with conditions\n", "pandas_series[pandas_series < 10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Operating with different Series uses labels to align\n", "another_series = pandas.Series({'eleven': 1, 'three': 0})\n", 
"pandas_series * another_series" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

EXERCISE 1: Fibonacci numbers

\n", "

Tasks:\n", "

\n", "

\n", "

Hints:\n", "

\n", "

\n", "
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load solutions/data_structures_1.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data types" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# All the elements of the Series have the same type (the internal data representation)\n", "pandas_series.dtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pandas uses numpy internally, and the main types are numpy types:\n", "import numpy\n", "# (numpy.bool_ rather than numpy.bool: the bare alias does not exist in NumPy 1.24-1.26)\n", "numpy.bool_, numpy.uint8, numpy.int64, numpy.float64" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For memory and speed, smaller types (e.g. 8 bits over 64 bits) are preferred, but they need to be big enough for our data\n", "pandas.Series([0, 255], dtype=numpy.uint8) + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.Series([0, 255], dtype=numpy.uint8) - 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# By default, pandas uses the largest types, to avoid problems\n", "pandas.Series([0, 255]).dtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pandas automatically changes the type (upcast) when mixing data\n", "int_series = pandas.Series([1, 5, 10], dtype=numpy.uint64)\n", "int_series.iloc[1] = 5.5\n", "int_series.dtype" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The main types:\n", "- object\n", " - float\n", " - int\n", " - bool\n", " - datetime\n", " - category\n", "\n", "`object` is a Python object and can be anything (for example, strings are objects)\n", "\n", "When mixing data of different types, the most specific type that can 
represent both is used. For example:\n", "- bool + int -> int\n", "- int + float -> float\n", "- float + datetime -> object" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bool_series = pandas.Series([True])\n", "int_series = pandas.Series([100])\n", "float_series = pandas.Series([3.141592])\n", "datetime_series = pandas.Series(pandas.Timestamp('2018-01-01'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.concat([bool_series, int_series])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.concat([int_series, float_series])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.concat([float_series, datetime_series])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# object is the most flexible, but is very slow\n", "large_series = pandas.Series(numpy.random.randint(0, 255, 100_000_000))\n", "large_series_as_object = large_series.astype(object)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit large_series.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%timeit large_series_as_object.sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

EXERCISE 2: Performance of int vs float

\n", "

Tasks:\n", "

\n", "

\n", "

Hints:\n", "

\n", "

\n", "
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load solutions/data_structures_2.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pandas DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A `DataFrame` can be seen as a collection of `Series` that share the index." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "legs = pandas.Series({'dog': 4, 'spider': 8, 'swallow': 2})\n", "species = pandas.Series({'dog': 'mammal', 'spider': 'insect', 'swallow': 'bird'})\n", "speed = pandas.Series({'dog': 8.49, 'spider': 0.53, 'swallow': 11.0})\n", "\n", "animals = pandas.DataFrame({'legs': legs, 'species': species, 'speed': speed})\n", "animals" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Every column of a DataFrame has a type\n", "animals.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If we select a column, it is returned as a regular Series\n", "animals['speed']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Same applies for a row, but types are generally upcasted\n", "animals.loc['swallow']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# DataFrames can be transposed\n", "animals.T" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# But again, types are generally upcasted, as the types are always specified column-wise\n", "animals.T.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If we operate a DataFrame with a scalar, the operation is element-wise\n", "animals * 2" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "animals" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Operations between DataFrame and Series are sometimes possible\n", "animals == pandas.Series([2, 'insect', 3.141592], index=['legs', 'species', 'speed'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Operations also exist as methods, so the axis can be changed\n", "animals.eq(pandas.Series([4, 'insect', 2], index=['dog', 'spider', 'swallow']),\n", " axis='rows')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Operations with two DataFrames are usually possible, and they operate element-wise based on both indices\n", "animals_bad_copy = animals.copy()\n", "animals_bad_copy.loc['dog', 'legs'] = 3\n", "animals_bad_copy.loc['spider', 'speed'] = 300.\n", "\n", "animals == animals_bad_copy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }