# Tutorial: Part 2 - Data structures

### Python lists

In [None]:
# A plain Python list holding the first eight prime numbers
python_list = [2, 3, 5, 7, 11, 13, 17, 19]

In [None]:
# Access by index (positions start at 0, so index 3 is the fourth element)
python_list[3]

In [None]:
# Loop over the elements in order, printing one per line
for number in python_list:
    print(number)

In [None]:
# List operations: built-in functions such as sum() work on the whole list
sum(python_list)

### Python dict

In [None]:
# Map the English name of each prime number to its value
python_dict = {
    'two': 2,
    'three': 3,
    'five': 5,
    'seven': 7,
    'eleven': 11,
    'thirteen': 13,
    'seventeen': 17,
    'nineteen': 19,
}

In [None]:
# Access by label: look a value up through its key
python_dict['eleven']

In [None]:
# Access by position: dicts do not support it. Whatever goes inside [] is
# treated as a key, so this raises KeyError (3 is not one of the keys)
python_dict[3]

In [None]:
# Loop over the (key, value) pairs of the dict
for name, number in python_dict.items():
    print(f'{name}: {number}')

### pandas Series

In [None]:
import pandas

# Build a Series from the dict: the keys become the index (the labels) and
# the values become the data; `name` labels the Series itself
pandas_series = pandas.Series(python_dict, name='primes')
pandas_series

In [None]:
# Access by label, in the same way as with a dict
pandas_series['eleven']

In [None]:
# Access by position: use .iloc, which is always positional.
# Plain [] with an integer is looked up as a label first; falling back to the
# position when the labels are not integers is deprecated in recent pandas.
pandas_series.iloc[3]

In [None]:
# Looping is usually a bad practice (for performance reasons)
# apply() calls print once per element; the trailing semicolon keeps the
# notebook from also displaying the value returned by apply()
pandas_series.apply(print);

In [None]:
# Series operations: aggregations such as the sum are available as methods
pandas_series.sum()

In [None]:
# Press <TAB> after the dot to see the list of available attributes and methods
# (interactive completion only: the incomplete line is not meant to be executed)
pandas_series.

In [None]:
# Read the documentation (a trailing ? is IPython/Jupyter help syntax, not plain Python)
pandas_series?

In [None]:
# Most operations can be applied to the whole Series (all elements at a time),
# without writing a loop
pandas_series + 1

In [None]:
# Comparisons are element-wise too, and return a Series of booleans
pandas_series < 10

In [None]:
# Slicing works as in Python lists (by position): this skips the first two
# and the last two elements
pandas_series[2:-2]

In [None]:
# Selection can be performed with a boolean mask: only the positions marked
# True are kept
pandas_series[[True, False, True, False, True, False, True, False]]

In [None]:
# This is mainly useful with conditions: the comparison builds the boolean
# mask, and the mask selects the elements
pandas_series[pandas_series < 10]

In [None]:
# Operating with different Series uses labels to align
# Labels that are not present in both Series produce NaN in the result
another_series = pandas.Series({'eleven': 1, 'three': 0})
pandas_series * another_series

### Data types

In [None]:
# All the elements of the Series have the same type (the internal data representation)
# The .dtype attribute shows which one
pandas_series.dtype

In [None]:
# pandas uses numpy internally, and the main types are numpy types.
# numpy.bool_ is the boolean scalar type; the bare numpy.bool alias is not
# available in every numpy release, so the underscore name is used here.
import numpy
numpy.bool_, numpy.uint8, numpy.int64, numpy.float64

In [None]:
# For memory and speed, smaller types are preferred (e.g. 8 bits over 64 bits),
# but they need to be big enough for our data: uint8 can only hold 0..255,
# so 255 + 1 wraps around to 0
pandas.Series([0, 255], dtype=numpy.uint8) + 1

In [None]:
# Same problem in the other direction: 0 - 1 wraps around to 255
pandas.Series([0, 255], dtype=numpy.uint8) - 1

In [None]:
# By default, pandas uses the largest types, to avoid problems
# (no dtype is given here, so pandas picks one for the integers)
pandas.Series([0, 255]).dtype

In [None]:
# pandas automatically changes the type (upcasts) when mixing data
# Start from unsigned integers, then store a value with a fractional part
# NOTE(review): recent pandas releases (2.1+) warn about this implicit upcast
# on assignment and are expected to turn it into an error - confirm against
# the installed version
int_series = pandas.Series([1, 5, 10], dtype=numpy.uint64)
int_series.iloc[1] = 5.5
int_series.dtype

The main types:
- object
  - float
    - int
      - bool
  - datetime
  - category

`object` means the values are stored as generic Python objects, so the Series can hold anything (for example, strings are stored as objects)

When mixing data of different types, the most specific type that can represent both is used. For example:
- bool + int -> int
- int + float -> float
- float + datetime -> object

In [None]:
# One single-element Series per type, used below to show how types combine
bool_series = pandas.Series([True])
int_series = pandas.Series([100])
float_series = pandas.Series([3.141592])
datetime_series = pandas.Series(pandas.Timestamp('2018-01-01'))

In [None]:
# bool combined with int
# NOTE(review): pandas may keep these values as `object` instead of converting
# the bool to int - check the dtype shown in the output
pandas.concat([bool_series, int_series])

In [None]:
# int combined with float: the result is expected to be float
pandas.concat([int_series, float_series])

In [None]:
# float combined with datetime: there is no more specific common type, so the
# result is expected to be object
pandas.concat([float_series, datetime_series])

In [None]:
# object is the most flexible, but is very slow
# Build a large Series of random integers, and an object-typed copy of the
# same data, so both can be timed below
# NOTE(review): 100 million elements need a lot of memory (likely more than a
# gigabyte for the two Series together) - reduce the size on small machines
large_series = pandas.Series(numpy.random.randint(0, 255, 100_000_000))
large_series_as_object = large_series.astype(object)

In [None]:
# Time the sum on the native integer type (%timeit is an IPython magic)
%timeit large_series.sum()

In [None]:
# Time the same sum on the object-typed copy, for comparison
%timeit large_series_as_object.sum()

### pandas DataFrame

A `DataFrame` can be seen as a collection of `Series` that share the same index.

In [None]:
# Three Series with the same labels (one per animal); each one becomes a
# column of the DataFrame, and the shared labels become its index
# NOTE(review): spiders are arachnids, not insects; the value is kept as is
# because later cells compare against 'insect'
animal_names = ['dog', 'spider', 'swallow']
legs = pandas.Series([4, 8, 2], index=animal_names)
species = pandas.Series(['mammal', 'insect', 'bird'], index=animal_names)
speed = pandas.Series([8.49, 0.53, 11.0], index=animal_names)

animals = pandas.DataFrame({
    'legs': legs,
    'species': species,
    'speed': speed,
})
animals

In [None]:
# Every column of a DataFrame has a type
# .dtypes lists them, one per column
animals.dtypes

In [None]:
# If we select a column, it is returned as a regular Series
# (indexed by the row labels of the DataFrame)
animals['speed']

In [None]:
# The same applies to a row, but types are generally upcast, because a row
# mixes values that come from columns of different types
animals.loc['swallow']

In [None]:
# DataFrames can be transposed (rows become columns and columns become rows)
animals.T

In [None]:
# But again, types are generally upcast, as the types are always specified column-wise
animals.T.dtypes

In [None]:
# If we operate on a DataFrame with a scalar, the operation is element-wise
animals * 2

In [None]:
# The previous result was not assigned, so the original DataFrame is unchanged
animals

In [None]:
# Operations between DataFrame and Series are sometimes possible
# By default the Series labels are matched with the column names, so every
# row is compared with the Series
animals == pandas.Series([2, 'insect', 3.141592], index=['legs', 'species', 'speed'])

In [None]:
# Operations also exist as methods, so the axis can be changed
# Along the row axis the Series labels are matched with the row labels, so
# every column is compared with the Series
row_values = pandas.Series({'dog': 4, 'spider': 'insect', 'swallow': 2})
animals.eq(row_values, axis='index')

In [None]:
# Operations with two DataFrames are usually possible, and they operate element-wise based on both indices
# Work on a copy, so the changes below do not affect `animals`
animals_bad_copy = animals.copy()
# Change two values so that they differ from the original
animals_bad_copy.loc['dog', 'legs'] = 3
animals_bad_copy.loc['spider', 'speed'] = 300.

# Only the two modified cells are expected to compare as False
animals == animals_bad_copy