In [1]:
# Data science imports
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 20

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

In [2]:
def ecdf(df, x, grouper=None):
    """
    Calculate empirical cumulative distribution function of a distribution
    
    :param df: dataframe
    :param x: string name of column
    :param grouper: string for column to groupby
    
    """
    df = df.copy()
    
    if grouper is not None:
        new_dfs = []
        for group, grouped_data in df.groupby(grouper):
            grouped_data.sort_values(x, ascending=True, inplace=True)
            n = len(grouped_data)
            grouped_data['y'] = np.arange(1, n + 1, step = 1) / n
            new_dfs.append(grouped_data)
        return pd.concat(new_dfs)
    
    # Sort by the column for distribution
    df.sort_values(x, ascending=True, inplace=True)
    n = len(df)
    # Calculate percentiles
    df['y'] = np.arange(1, n + 1, step=1) / n
    return df

In [3]:
wages = pd.read_csv('data/us_state_wages.csv')
wages.head()

Unnamed: 0,year,geo_name,geo,income
0,2013,Alabama,04000US01,43253.0
1,2013,Alaska,04000US02,70760.0
2,2013,Arizona,04000US04,49774.0
3,2013,Arkansas,04000US05,40768.0
4,2013,California,04000US06,61094.0


In [None]:
wages.tail()

In [6]:
figure = df.iplot(x='income', y='y', mode='markers+lines', categories='year', hline=[0.5],
         xTitle='income', yTitle='percentile', title='ECDF of Wages in US states', asFigure=True)
figure.data

(Scatter({
     'marker': {'color': 'rgba(255, 153, 51, 1.0)', 'line': {'width': 1.3}, 'opacity': 0.8, 'size': 12, 'symbol': 'circle'},
     'mode': 'markers+lines',
     'name': '2015',
     'textfont': {'color': '#4D5663'},
     'uid': 'cbec7bad-c388-403a-87c2-bdd7258b9336',
     'x': array([19350., 39665., 41371., 41751., 43623., 43740., 44963., 45047., 45219.,
                 45483., 46868., 46879., 47169., 47507., 47583., 48173., 49255., 49331.,
                 49429., 49576., 49620., 50255., 50957., 51243., 51847., 52205., 52997.,
                 53183., 53207., 53357., 53599., 55176., 56852., 57181., 57574., 58840.,
                 59269., 60509., 60629., 60727., 61062., 61492., 61818., 65015., 66779.,
                 68563., 69515., 70331., 70848., 72093., 72515., 74551.]),
     'y': array([0.00480769, 0.03365385, 0.05769231, 0.06730769, 0.10096154, 0.10576923,
                 0.14423077, 0.16346154, 0.16826923, 0.17788462, 0.22596154, 0.23076923,
                 0.24519

In [7]:
figure.layout.shapes

(layout.Shape({
     'line': {'color': '#db4052', 'dash': 'solid', 'width': 1},
     'type': 'line',
     'x0': 0,
     'x1': 1,
     'xref': 'paper',
     'y0': 0.5,
     'y1': 0.5,
     'yref': 'y'
 }),)

In [5]:
df = ecdf(wages, 'income')
df['year'] = df['year'].astype(str)
df.iplot(x='income', y='y', mode='markers+lines', categories='year', hline=[0.5],
         xTitle='income', yTitle='percentile', title='ECDF of Wages in US states')

In [8]:
df = ecdf(wages, 'income', grouper='year')
df['year'] = df['year'].astype(str)
df.iplot(x='income', y='y', mode='markers+lines', categories='year',
         xTitle='income', yTitle='percentile', title='ECDF of Wages in US states')

In [9]:
df = wages.copy()

In [70]:
df.loc[(df[x] - df[x].mean()).abs().idxmin(), 'y']

0.5769230769230769

In [71]:
x = 'income'
text = 'geo_name'
grouper = None
title_override = None

df = ecdf(df, x, grouper=grouper)

if grouper is not None:
    data = [
        go.Scatter(
            x=grouped_data[x],
            y=grouped_data['y'],
            mode='markers+lines',
            marker=dict(opacity=0.6, line=dict(color='black', width=2)),
            text=grouped_data[text],
            name=group) for group, grouped_data in df.groupby(grouper)
    ]

else:
    data = [
        go.Scatter(
            x=df[x],
            y=df['y'],
            mode='markers+lines',
            marker=dict(
                color='blue', opacity=0.6, line=dict(color='black', width=2)),
            text=df[text])
    ]

quantiles = [0.25, 0.5, 0.75]
    
shapes = [go.layout.Shape(type='line', x0=0, x1=1, y0=q, y1=q, xref='paper', yref='y',
                          line=dict(width=2.1, color = 'red', dash='dash')) for q in quantiles]

annotations = [go.layout.Annotation(x=0, y=q + 0.05, showarrow=False,
                                    xanchor='left', font=dict(size=14), xshift=20,
                                    text=f'{q}: {df[x].quantile(q):.2f}') for q in quantiles]

annotations.append(go.layout.Annotation(x=df[x].min(), font=dict(size=14), 
                                        y=0.05, text=f'Min: {df[x].min():.2f}'))
annotations.append(go.layout.Annotation(x=df[x].max(), font=dict(size=14),
                                        y=1, text=f'Max: {df[x].max():.2f}'))
annotations.append(go.layout.Annotation(x=df[x].mean(), font=dict(size=14),
                                        y=df.loc[(df[x] - df[x].mean()).abs().idxmin(), 'y'], 
                                        text=f'Mean: {df[x].mean():.2f}'))

x_nice = x.replace('_', '').title()
layout = go.Layout(
    annotations=annotations,
    shapes=shapes,
    xaxis=dict(title=x_nice),
    yaxis=dict(title='percentile'),
    title=f'ECDF of {x_nice}' if title_override is None else title_override)

figure = go.Figure(data=data, layout=layout)

In [72]:
annotations

[layout.Annotation({
     'font': {'size': 14}, 'showarrow': False, 'text': '0.25: 47303.50', 'x': 0, 'xanchor': 'left', 'xshift': 20, 'y': 0.3
 }), layout.Annotation({
     'font': {'size': 14}, 'showarrow': False, 'text': '0.5: 52406.50', 'x': 0, 'xanchor': 'left', 'xshift': 20, 'y': 0.55
 }), layout.Annotation({
     'font': {'size': 14}, 'showarrow': False, 'text': '0.75: 60347.75', 'x': 0, 'xanchor': 'left', 'xshift': 20, 'y': 0.8
 }), layout.Annotation({
     'font': {'size': 14}, 'text': 'Min: 19350.00', 'x': 19350.0, 'y': 0.05
 }), layout.Annotation({
     'font': {'size': 14}, 'text': 'Max: 76067.00', 'x': 76067.0, 'y': 1
 }), layout.Annotation({
     'font': {'size': 14}, 'text': 'Mean: 53908.78', 'x': 53908.778846153844, 'y': 0.5769230769230769
 })]

In [73]:
iplot(figure)

In [16]:
df.groupby('year')['income'].quantile([0.25, 0.5, 0.75])
df.groupby('year')['income'].mean()

year      
2013  0.25    46658.75
      0.50    51757.50
      0.75    58985.25
2014  0.25    47100.50
      0.50    52302.50
      0.75    59942.25
2015  0.25    47422.50
      0.50    52601.00
      0.75    60653.50
2016  0.25    48770.00
      0.50    53977.50
      0.75    62518.50
Name: income, dtype: float64

year
2013    52878.230769
2014    53468.788462
2015    53957.519231
2016    55330.576923
Name: income, dtype: float64