def _with_ecdf_column(frame, x):
    """Return a sorted copy of ``frame`` with the ECDF of column ``x`` stored in ``'y'``."""
    # sort_values without inplace returns a new frame, so the caller's data
    # (or a groupby slice of it) is never mutated.
    ordered = frame.sort_values(x, ascending=True)
    n = len(ordered)
    # rank / n -> 1/n, 2/n, ..., 1. For n == 0 this is an empty array, so the
    # division never actually evaluates anything.
    return ordered.assign(y=np.arange(1, n + 1, step=1) / n)


def ecdf(df, x, grouper=None):
    """
    Calculate empirical cumulative distribution function of a distribution

    Rows are sorted ascending by ``x`` and a new column ``'y'`` is added that
    holds ``rank / n`` (``1/n, 2/n, ..., 1``), where ``n`` is the number of
    rows being ranked. The input dataframe is not modified; a new dataframe
    is returned and the original index labels are kept.

    Every row counts towards ``n``, including rows where ``x`` is missing
    (``sort_values`` places those last). Tied values of ``x`` receive
    consecutive, distinct ``y`` values.

    :param df: dataframe
    :param x: string name of column
    :param grouper: string for column to groupby. When given, the ECDF is
        calculated separately inside each group and the per-group results
        are concatenated in the order ``df.groupby(grouper)`` yields them.
        Rows that ``groupby`` does not assign to any group (missing keys,
        under its default settings) are absent from the result.
    :return: new dataframe sorted by ``x`` (within each group when
        ``grouper`` is given) with the extra ``'y'`` column. If there are no
        groups at all, an empty dataframe with the same columns plus ``'y'``
        is returned.

    """
    if grouper is None:
        return _with_ecdf_column(df, x)

    pieces = [
        _with_ecdf_column(group_rows, x)
        for _, group_rows in df.groupby(grouper)
    ]

    if not pieces:
        # pd.concat([]) raises "ValueError: No objects to concatenate".
        # Return an empty frame with the expected columns instead, which is
        # what the ungrouped path already does for empty input.
        return _with_ecdf_column(df.iloc[0:0], x)

    return pd.concat(pieces)
"outputs": [ { "data": { "text/html": [ "
\n", " | year | \n", "geo_name | \n", "geo | \n", "income | \n", "
---|---|---|---|---|
0 | \n", "2013 | \n", "Alabama | \n", "04000US01 | \n", "43253.0 | \n", "
1 | \n", "2013 | \n", "Alaska | \n", "04000US02 | \n", "70760.0 | \n", "
2 | \n", "2013 | \n", "Arizona | \n", "04000US04 | \n", "49774.0 | \n", "
3 | \n", "2013 | \n", "Arkansas | \n", "04000US05 | \n", "40768.0 | \n", "
4 | \n", "2013 | \n", "California | \n", "04000US06 | \n", "61094.0 | \n", "