
Updated notebook

Will K 6 years ago
parent
commit
f8cb8b51e7
100 changed files with 219828 additions and 3 deletions
  1. 3 0
      .gitignore
  2. 313 0
      Facts/Blame Plotting.ipynb
  3. 6915 0
      copernican/Copernican Time Principle.ipynb
  4. 2586 0
      ecdf/ECDF Plotly.ipynb
  5. 209 0
      ecdf/data/us_state_wages.csv
  6. 2869 0
      ecdf/data/us_wages_jobs.csv
  7. 1 0
      medium/.pytest_cache/v/cache/nodeids
  8. Binary
      medium/2019-01-13_stats
  9. 16710 0
      medium/Development.ipynb
  10. 52688 0
      medium/Fitting.ipynb
  11. 55992 0
      medium/Medium Stats Analysis.ipynb
  12. 6566 0
      medium/Work In Progress.ipynb
  13. 79 0
      medium/bargraphs.py
  14. Binary
      medium/data/medium_data_2019_01_06
  15. 8 0
      medium/data/published.html
  16. 9 0
      medium/data/stats-responses.html
  17. 9 0
      medium/data/stats.html
  18. 8 0
      medium/data/unlisted.html
  19. Binary
      medium/images/Screen Recording 2018-12-29 at 11.45.18 AM.mov
  20. Binary
      medium/images/Screen Recording 2018-12-30 at 1.14.01 PM.mov
  21. Binary
      medium/images/Screen Recording 2018-12-31 at 12.38.22 PM.mov
  22. Binary
      medium/images/Screen Shot 2018-12-26 at 12.14.03 PM.png
  23. Binary
      medium/images/Screen Shot 2018-12-26 at 12.15.43 PM.png
  24. Binary
      medium/images/Screen Shot 2018-12-28 at 10.31.16 AM.png
  25. Binary
      medium/images/Screen Shot 2018-12-28 at 10.32.43 AM.png
  26. Binary
      medium/images/Screen Shot 2018-12-28 at 10.35.31 AM.png
  27. Binary
      medium/images/Screen Shot 2018-12-28 at 11.48.57 AM.png
  28. Binary
      medium/images/Screen Shot 2018-12-28 at 11.50.34 AM.png
  29. Binary
      medium/images/Screen Shot 2018-12-28 at 11.52.49 AM.png
  30. Binary
      medium/images/Screen Shot 2018-12-29 at 11.00.44 AM.png
  31. Binary
      medium/images/Screen Shot 2018-12-31 at 1.16.29 PM.png
  32. Binary
      medium/images/Screen Shot 2018-12-31 at 1.22.24 PM.png
  33. Binary
      medium/images/Screen Shot 2018-12-31 at 1.29.30 PM.png
  34. Binary
      medium/images/Screen Shot 2018-12-31 at 12.14.00 PM.png
  35. Binary
      medium/images/Screen Shot 2018-12-31 at 2.00.43 PM.png
  36. Binary
      medium/images/Screen Shot 2018-12-31 at 2.00.55 PM.png
  37. Binary
      medium/images/Screen Shot 2018-12-31 at 2.02.49 PM.png
  38. Binary
      medium/images/Screen Shot 2018-12-31 at 2.03.01 PM.png
  39. Binary
      medium/images/Screen Shot 2018-12-31 at 2.03.29 PM.png
  40. Binary
      medium/images/Screen Shot 2018-12-31 at 2.33.18 PM.png
  41. Binary
      medium/images/Screen Shot 2018-12-31 at 2.35.47 PM.png
  42. Binary
      medium/images/Screen Shot 2018-12-31 at 2.47.10 PM.png
  43. Binary
      medium/images/Screen Shot 2018-12-31 at 2.47.26 PM.png
  44. Binary
      medium/images/Screen Shot 2018-12-31 at 2.47.30 PM.png
  45. Binary
      medium/images/Screen Shot 2018-12-31 at 2.47.38 PM.png
  46. Binary
      medium/images/Screen Shot 2018-12-31 at 2.55.52 PM.png
  47. Binary
      medium/images/Screen Shot 2018-12-31 at 2.56.25 PM.png
  48. Binary
      medium/images/Screen Shot 2018-12-31 at 2.56.33 PM.png
  49. Binary
      medium/images/Screen Shot 2018-12-31 at 3.04.18 PM.png
  50. Binary
      medium/images/Screen Shot 2018-12-31 at 3.06.32 PM.png
  51. Binary
      medium/images/Screen Shot 2018-12-31 at 3.06.38 PM.png
  52. Binary
      medium/images/Screen Shot 2018-12-31 at 3.14.38 PM.png
  53. Binary
      medium/images/Screen Shot 2018-12-31 at 3.14.46 PM.png
  54. Binary
      medium/images/Screen Shot 2018-12-31 at 3.14.55 PM.png
  55. Binary
      medium/images/Screen Shot 2018-12-31 at 3.15.30 PM.png
  56. Binary
      medium/images/Screen Shot 2018-12-31 at 3.15.35 PM.png
  57. Binary
      medium/images/Screen Shot 2018-12-31 at 3.20.58 PM.png
  58. Binary
      medium/images/Screen Shot 2018-12-31 at 3.21.08 PM.png
  59. Binary
      medium/images/Screen Shot 2018-12-31 at 3.21.57 PM.png
  60. Binary
      medium/images/Screen Shot 2018-12-31 at 3.22.45 PM.png
  61. Binary
      medium/images/Screen Shot 2018-12-31 at 3.28.50 PM.png
  62. Binary
      medium/images/Screen Shot 2018-12-31 at 3.29.19 PM.png
  63. Binary
      medium/images/Screen Shot 2018-12-31 at 3.32.35 PM.png
  64. Binary
      medium/images/annotations.png
  65. Binary
      medium/images/corrplot.png
  66. 24 0
      medium/images/data_for_fitting.py
  67. Binary
      medium/images/dist_by_pub.png
  68. Binary
      medium/images/plotting_enjoyment.png
  69. Binary
      medium/images/read_wordcount.png
  70. Binary
      medium/images/reads_vs_word_count.png
  71. Binary
      medium/images/scatterplot_matrix.png
  72. Binary
      medium/images/shutterstock_196082240.jpg
  73. Binary
      medium/images/stat_graph.png
  74. Binary
      medium/images/stats-saving-medium.gif
  75. Binary
      medium/images/table.png
  76. Binary
      medium/images/wiki_search.gif
  77. Binary
      medium/images/word_count_and_views.png
  78. Binary
      medium/medium_views_time
  79. 60 0
      medium/readme.md
  80. 226 0
      medium/retrieval.py
  81. 38 0
      medium/view_extraction.py
  82. 390 0
      medium/visuals.py
  83. 40170 0
      plotly/Plotly Whirlwind Introduction.ipynb
  84. 33831 3
      plotly/building_one.csv
  85. 124 0
      plotly/data/blue_jays.csv
  86. Binary
      plotly/data/medium_data_2019_01_06
  87. Binary
      plotly/images/Screen Recording 2019-01-07 at 11.21.57 CST.mov
  88. Binary
      plotly/images/Screen Recording 2019-01-07 at 12.27.50 CST.mov
  89. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.16.50 CST.mov
  90. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.18.35 CST.mov
  91. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.24.11 CST.mov
  92. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.24.25 CST.mov
  93. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.31.48 CST.mov
  94. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.32.01 CST.mov
  95. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.32.09 CST.mov
  96. Binary
      plotly/images/Screen Recording 2019-01-08 at 20.40.18 CST.mov
  97. Binary
      plotly/images/Screen Shot 2019-01-07 at 13.49.44 CST.png
  98. Binary
      plotly/images/Screen Shot 2019-01-07 at 15.59.48 CST.png
  99. Binary
      plotly/images/Screen Shot 2019-01-07 at 16.04.22 CST.png
  100. 0 0
      plotly/images/Screen Shot 2019-01-07 at 16.34.10 CST.png

+ 3 - 0
.gitignore

@@ -1 +1,4 @@
+*.pyc
+*__pycache__
 *DS_Store
+medium/data/*_files

File diff suppressed because it is too large
+ 313 - 0
Facts/Blame Plotting.ipynb


File diff suppressed because it is too large
+ 6915 - 0
copernican/Copernican Time Principle.ipynb


File diff suppressed because it is too large
+ 2586 - 0
ecdf/ECDF Plotly.ipynb


+ 209 - 0
ecdf/data/us_state_wages.csv

@@ -0,0 +1,209 @@
+year,geo_name,geo,income
+2013,"Alabama","04000US01",43253.0
+2013,"Alaska","04000US02",70760.0
+2013,"Arizona","04000US04",49774.0
+2013,"Arkansas","04000US05",40768.0
+2013,"California","04000US06",61094.0
+2013,"Colorado","04000US08",58433.0
+2013,"Connecticut","04000US09",69461.0
+2013,"Delaware","04000US10",59878.0
+2013,"District of Columbia","04000US11",65830.0
+2013,"Florida","04000US12",46956.0
+2013,"Georgia","04000US13",49179.0
+2013,"Hawaii","04000US15",67402.0
+2013,"Idaho","04000US16",46767.0
+2013,"Illinois","04000US17",56797.0
+2013,"Indiana","04000US18",48248.0
+2013,"Iowa","04000US19",51843.0
+2013,"Kansas","04000US20",51332.0
+2013,"Kentucky","04000US21",43036.0
+2013,"Louisiana","04000US22",44874.0
+2013,"Maine","04000US23",48453.0
+2013,"Maryland","04000US24",73538.0
+2013,"Massachusetts","04000US25",66866.0
+2013,"Michigan","04000US26",48411.0
+2013,"Minnesota","04000US27",59836.0
+2013,"Mississippi","04000US28",39031.0
+2013,"Missouri","04000US29",47380.0
+2013,"Montana","04000US30",46230.0
+2013,"Nebraska","04000US31",51672.0
+2013,"Nevada","04000US32",52800.0
+2013,"New Hampshire","04000US33",64916.0
+2013,"New Jersey","04000US34",71629.0
+2013,"New Mexico","04000US35",44927.0
+2013,"New York","04000US36",58003.0
+2013,"North Carolina","04000US37",46334.0
+2013,"North Dakota","04000US38",53741.0
+2013,"Ohio","04000US39",48308.0
+2013,"Oklahoma","04000US40",45339.0
+2013,"Oregon","04000US41",50229.0
+2013,"Pennsylvania","04000US42",52548.0
+2013,"Rhode Island","04000US44",56361.0
+2013,"South Carolina","04000US45",44779.0
+2013,"South Dakota","04000US46",49495.0
+2013,"Tennessee","04000US47",44298.0
+2013,"Texas","04000US48",51900.0
+2013,"Utah","04000US49",58821.0
+2013,"Vermont","04000US50",54267.0
+2013,"Virginia","04000US51",63907.0
+2013,"Washington","04000US53",59478.0
+2013,"West Virginia","04000US54",41043.0
+2013,"Wisconsin","04000US55",52413.0
+2013,"Wyoming","04000US56",57406.0
+2013,"Puerto Rico","04000US72",19624.0
+2014,"Alabama","04000US01",43511.0
+2014,"Alaska","04000US02",71829.0
+2014,"Arizona","04000US04",49928.0
+2014,"Arkansas","04000US05",41264.0
+2014,"California","04000US06",61489.0
+2014,"Colorado","04000US08",59448.0
+2014,"Connecticut","04000US09",69899.0
+2014,"Delaware","04000US10",60231.0
+2014,"District of Columbia","04000US11",69235.0
+2014,"Florida","04000US12",47212.0
+2014,"Georgia","04000US13",49342.0
+2014,"Hawaii","04000US15",68201.0
+2014,"Idaho","04000US16",47334.0
+2014,"Illinois","04000US17",57166.0
+2014,"Indiana","04000US18",48737.0
+2014,"Iowa","04000US19",52716.0
+2014,"Kansas","04000US20",51872.0
+2014,"Kentucky","04000US21",43342.0
+2014,"Louisiana","04000US22",44991.0
+2014,"Maine","04000US23",48804.0
+2014,"Maryland","04000US24",74149.0
+2014,"Massachusetts","04000US25",67846.0
+2014,"Michigan","04000US26",49087.0
+2014,"Minnesota","04000US27",60828.0
+2014,"Mississippi","04000US28",39464.0
+2014,"Missouri","04000US29",47764.0
+2014,"Montana","04000US30",46766.0
+2014,"Nebraska","04000US31",52400.0
+2014,"Nevada","04000US32",52205.0
+2014,"New Hampshire","04000US33",65986.0
+2014,"New Jersey","04000US34",72062.0
+2014,"New Mexico","04000US35",44968.0
+2014,"New York","04000US36",58687.0
+2014,"North Carolina","04000US37",46693.0
+2014,"North Dakota","04000US38",55579.0
+2014,"Ohio","04000US39",48849.0
+2014,"Oklahoma","04000US40",46235.0
+2014,"Oregon","04000US41",50521.0
+2014,"Pennsylvania","04000US42",53115.0
+2014,"Rhode Island","04000US44",56423.0
+2014,"South Carolina","04000US45",45033.0
+2014,"South Dakota","04000US46",50338.0
+2014,"Tennessee","04000US47",44621.0
+2014,"Texas","04000US48",52576.0
+2014,"Utah","04000US49",59846.0
+2014,"Vermont","04000US50",54447.0
+2014,"Virginia","04000US51",64792.0
+2014,"Washington","04000US53",60294.0
+2014,"West Virginia","04000US54",41576.0
+2014,"Wisconsin","04000US55",52738.0
+2014,"Wyoming","04000US56",58252.0
+2014,"Puerto Rico","04000US72",19686.0
+2015,"Alaska","04000US02",72515.0
+2015,"Alabama","04000US01",43623.0
+2015,"Arkansas","04000US05",41371.0
+2015,"Arizona","04000US04",50255.0
+2015,"California","04000US06",61818.0
+2015,"Colorado","04000US08",60629.0
+2015,"Connecticut","04000US09",70331.0
+2015,"District of Columbia","04000US11",70848.0
+2015,"Delaware","04000US10",60509.0
+2015,"Florida","04000US12",47507.0
+2015,"Georgia","04000US13",49620.0
+2015,"Hawaii","04000US15",69515.0
+2015,"Iowa","04000US19",53183.0
+2015,"Idaho","04000US16",47583.0
+2015,"Illinois","04000US17",57574.0
+2015,"Indiana","04000US18",49255.0
+2015,"Kansas","04000US20",52205.0
+2015,"Kentucky","04000US21",43740.0
+2015,"Louisiana","04000US22",45047.0
+2015,"Massachusetts","04000US25",68563.0
+2015,"Maryland","04000US24",74551.0
+2015,"Maine","04000US23",49331.0
+2015,"Michigan","04000US26",49576.0
+2015,"Minnesota","04000US27",61492.0
+2015,"Missouri","04000US29",48173.0
+2015,"Mississippi","04000US28",39665.0
+2015,"Montana","04000US30",47169.0
+2015,"North Carolina","04000US37",46868.0
+2015,"North Dakota","04000US38",57181.0
+2015,"Nebraska","04000US31",52997.0
+2015,"New Hampshire","04000US33",66779.0
+2015,"New Jersey","04000US34",72093.0
+2015,"New Mexico","04000US35",44963.0
+2015,"Nevada","04000US32",51847.0
+2015,"New York","04000US36",59269.0
+2015,"Ohio","04000US39",49429.0
+2015,"Oklahoma","04000US40",46879.0
+2015,"Oregon","04000US41",51243.0
+2015,"Pennsylvania","04000US42",53599.0
+2015,"Puerto Rico","04000US72",19350.0
+2015,"Rhode Island","04000US44",56852.0
+2015,"South Carolina","04000US45",45483.0
+2015,"South Dakota","04000US46",50957.0
+2015,"Tennessee","04000US47",45219.0
+2015,"Texas","04000US48",53207.0
+2015,"Utah","04000US49",60727.0
+2015,"Virginia","04000US51",65015.0
+2015,"Vermont","04000US50",55176.0
+2015,"Washington","04000US53",61062.0
+2015,"Wisconsin","04000US55",53357.0
+2015,"West Virginia","04000US54",41751.0
+2015,"Wyoming","04000US56",58840.0
+2016,"Alabama","04000US01",44758.0
+2016,"Alaska","04000US02",74444.0
+2016,"Arizona","04000US04",51340.0
+2016,"Arkansas","04000US05",42336.0
+2016,"California","04000US06",63783.0
+2016,"Colorado","04000US08",62520.0
+2016,"Connecticut","04000US09",71755.0
+2016,"Delaware","04000US10",61017.0
+2016,"District of Columbia","04000US11",72935.0
+2016,"Florida","04000US12",48900.0
+2016,"Georgia","04000US13",51037.0
+2016,"Hawaii","04000US15",71977.0
+2016,"Idaho","04000US16",49174.0
+2016,"Illinois","04000US17",59196.0
+2016,"Indiana","04000US18",50433.0
+2016,"Iowa","04000US19",54570.0
+2016,"Kansas","04000US20",53571.0
+2016,"Kentucky","04000US21",44811.0
+2016,"Louisiana","04000US22",45652.0
+2016,"Maine","04000US23",50826.0
+2016,"Maryland","04000US24",76067.0
+2016,"Massachusetts","04000US25",70954.0
+2016,"Michigan","04000US26",50803.0
+2016,"Minnesota","04000US27",63217.0
+2016,"Mississippi","04000US28",40528.0
+2016,"Missouri","04000US29",49593.0
+2016,"Montana","04000US30",48380.0
+2016,"Nebraska","04000US31",54384.0
+2016,"Nevada","04000US32",53094.0
+2016,"New Hampshire","04000US33",68485.0
+2016,"New Jersey","04000US34",73702.0
+2016,"New Mexico","04000US35",45674.0
+2016,"New York","04000US36",60741.0
+2016,"North Carolina","04000US37",48256.0
+2016,"North Dakota","04000US38",59114.0
+2016,"Ohio","04000US39",50674.0
+2016,"Oklahoma","04000US40",48038.0
+2016,"Oregon","04000US41",53270.0
+2016,"Pennsylvania","04000US42",54895.0
+2016,"Rhode Island","04000US44",58387.0
+2016,"South Carolina","04000US45",46898.0
+2016,"South Dakota","04000US46",52078.0
+2016,"Tennessee","04000US47",46574.0
+2016,"Texas","04000US48",54727.0
+2016,"Utah","04000US49",62518.0
+2016,"Vermont","04000US50",56104.0
+2016,"Virginia","04000US51",66149.0
+2016,"Washington","04000US53",62848.0
+2016,"West Virginia","04000US54",42644.0
+2016,"Wisconsin","04000US55",54610.0
+2016,"Wyoming","04000US56",59143.0
+2016,"Puerto Rico","04000US72",19606.0

File diff suppressed because it is too large
+ 2869 - 0
ecdf/data/us_wages_jobs.csv


+ 1 - 0
medium/.pytest_cache/v/cache/nodeids

@@ -0,0 +1 @@
+[]

Binary
medium/2019-01-13_stats


File diff suppressed because it is too large
+ 16710 - 0
medium/Development.ipynb


File diff suppressed because it is too large
+ 52688 - 0
medium/Fitting.ipynb


File diff suppressed because it is too large
+ 55992 - 0
medium/Medium Stats Analysis.ipynb


File diff suppressed because it is too large
+ 6566 - 0
medium/Work In Progress.ipynb


+ 79 - 0
medium/bargraphs.py

@@ -0,0 +1,79 @@
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from dateutil import parser
+from datetime import datetime, timedelta
+import pandas as pd
+import math
+import time
+
+driver = webdriver.Chrome()
+driver.get("https://medium.com/me/stats")
+input('Waiting for you to log in. Press enter when ready: ')
+earliest_article_date = parser.parse(
+    input('Enter earliest article date as string: ')).date()
+days = (datetime.now().date()
+        - earliest_article_date).total_seconds() / (60 * 60 * 24)
+months = math.ceil(days / 30)
+
+
+def get_all_pages(driver, xpath, months):
+
+    # Initially starting at today
+    latest_date_in_graph = datetime.now().date()
+
+    print('Starting on ', latest_date_in_graph)
+
+    views = []
+    dates = []
+
+    # Iterate through the graphs
+    for m in range(months + 1):
+        graph_views = []
+        graph_dates = []
+        # Extract the bar graph
+        bargraph = BeautifulSoup(driver.page_source, features='lxml').find_all(
+            attrs={'class': 'bargraph'})[0]
+
+        # Get all the bars in the bargraph
+        bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
+        # Sort the bar data by x position (which will be date order) with most recent first
+        bardata = sorted(bardata, key=lambda x: float(
+            x.get('x')), reverse=True)
+        bardata = [bar.get('data-tooltip') for bar in bardata]
+        latest_day = int(bardata[0].split('\xa0')[-1])
+
+        # Some months are not overlapping
+        if latest_day != latest_date_in_graph.day:
+            latest_date_in_graph -= timedelta(days=1)
+
+        # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
+        for i, data in enumerate(bardata):
+            graph_views.append(float(data.split(' ')[0].replace(',', '')))
+            graph_dates.append(latest_date_in_graph - timedelta(days=i))
+
+        views.extend(graph_views)
+        dates.extend(graph_dates)
+        # Find the earliest date in the graph
+        earliest_date_in_graph = graph_dates[-1]
+
+        # Update the latest date in the next graph
+        latest_date_in_graph = earliest_date_in_graph
+
+        # Go to the previous graph
+        driver.find_element_by_xpath(xpath).click()
+        time.sleep(2)
+        print(f'{100 * m / months:.1f}% complete.', end='\r')
+
+    results = pd.DataFrame({'date': pd.to_datetime(
+        dates), 'views': views}).groupby('date').sum()
+    results = results.loc[results[results['views'] != 0.0].index.min():, ]
+    print('First views on ', str(results.index.min().date()))
+    return results
+
+
+xpath = input('Paste xpath as string: ')
+
+results = get_all_pages(driver, xpath, months)
+fname = f'{str(datetime.now().date())}_stats'
+results.to_parquet(fname)
+print('Stats saved to ', fname)

Binary
medium/data/medium_data_2019_01_06


File diff suppressed because it is too large
+ 8 - 0
medium/data/published.html


File diff suppressed because it is too large
+ 9 - 0
medium/data/stats-responses.html


File diff suppressed because it is too large
+ 9 - 0
medium/data/stats.html


File diff suppressed because it is too large
+ 8 - 0
medium/data/unlisted.html


Binary
medium/images/Screen Recording 2018-12-29 at 11.45.18 AM.mov


Binary
medium/images/Screen Recording 2018-12-30 at 1.14.01 PM.mov


Binary
medium/images/Screen Recording 2018-12-31 at 12.38.22 PM.mov


Binary
medium/images/Screen Shot 2018-12-26 at 12.14.03 PM.png


Binary
medium/images/Screen Shot 2018-12-26 at 12.15.43 PM.png


Binary
medium/images/Screen Shot 2018-12-28 at 10.31.16 AM.png


Binary
medium/images/Screen Shot 2018-12-28 at 10.32.43 AM.png


Binary
medium/images/Screen Shot 2018-12-28 at 10.35.31 AM.png


Binary
medium/images/Screen Shot 2018-12-28 at 11.48.57 AM.png


Binary
medium/images/Screen Shot 2018-12-28 at 11.50.34 AM.png


Binary
medium/images/Screen Shot 2018-12-28 at 11.52.49 AM.png


Binary
medium/images/Screen Shot 2018-12-29 at 11.00.44 AM.png


Binary
medium/images/Screen Shot 2018-12-31 at 1.16.29 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 1.22.24 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 1.29.30 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 12.14.00 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.00.43 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.00.55 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.02.49 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.03.01 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.03.29 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.33.18 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.35.47 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.47.10 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.47.26 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.47.30 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.47.38 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.55.52 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.56.25 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 2.56.33 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.04.18 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.06.32 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.06.38 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.14.38 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.14.46 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.14.55 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.15.30 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.15.35 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.20.58 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.21.08 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.21.57 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.22.45 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.28.50 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.29.19 PM.png


Binary
medium/images/Screen Shot 2018-12-31 at 3.32.35 PM.png


Binary
medium/images/annotations.png


Binary
medium/images/corrplot.png


+ 24 - 0
medium/images/data_for_fitting.py

@@ -0,0 +1,24 @@
+import pandas as pd
+from pandas.tseries.offsets import BDay
+from datetime import timedelta


+def data_for_fitting(*, building_id, date):
+    """
+    Retrieves data for fitting from the previous business day
+    taking into account holidays
+    """
+
+    lease_start = None
+    while lease_start is None:
+        # Previous business day according to Pandas (might be a holiday)
+        previous_bday = pd.to_datetime(date) - BDay(1)
+
+        # If a holiday, this will return None
+        lease_start = (db().execute(building_daily_stats.select().where(
+            building_daily_stats.c.building_id == building_id).where(
+                building_daily_stats.c.date == previous_bday)).fetchone().
+                       lease_obligations_start_at)
+
+        date = previous_bday
+
+    # Retrieve 8 hours of data from the lease start
+    return load_sensor_values(
+        building_id=building_id,
+        start_time=lease_start,
+        end_time=lease_start + timedelta(hours=8))
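

+# Usage sketch (hypothetical IDs; `db` and `building_daily_stats` come from
+# the application's database layer):
+# fitting_data = data_for_fitting(building_id=12, date='2019-01-04')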

Binary
medium/images/dist_by_pub.png


Binary
medium/images/plotting_enjoyment.png


Binary
medium/images/read_wordcount.png


Binary
medium/images/reads_vs_word_count.png


Binary
medium/images/scatterplot_matrix.png


Binary
medium/images/shutterstock_196082240.jpg


Binary
medium/images/stat_graph.png


Binary
medium/images/stats-saving-medium.gif


Binary
medium/images/table.png


Binary
medium/images/wiki_search.gif


Binary
medium/images/word_count_and_views.png


Binary
medium/medium_views_time


+ 60 - 0
medium/readme.md

@@ -0,0 +1,60 @@
+# Tools for analyzing Medium article statistics
+
+The Medium stats Python toolkit is a suite of tools for retrieving, analyzing, predicting, and visualizing
+your Medium article stats. You can also run the analysis on my Medium statistics,
+which are located in `data/`.
+
+* Note: running on Mac may first require setting
+    `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
+    from the command line [to enable parallel processing](https://stackoverflow.com/questions/50168647/multiprocessing-causes-python-to-crash-and-gives-an-error-may-have-been-in-progr)
+
+* For complete usage, refer to the `Medium Stats Analysis` notebook
+* Data retrieval code lives in `retrieval.py`
+* Visualization and analysis code is in `visuals.py`
+* See also the Medium article ["Medium Analysis in Python"](https://medium.com/@williamkoehrsen/analyzing-medium-story-stats-with-python-24c6491a8ff0)
+* Contributions are welcome and appreciated
+* For help contact wjk68@case.edu or twitter.com/@koehrsen_will
+
+## Basic usage
+
+### Use your own Medium statistics
+1. Go to the stats page https://medium.com/me/stats
+2. Scroll all the way down to the bottom so all the articles are loaded
+3. Right click, and hit 'save as'
+4. Save the file as `stats.html` in the `data/` directory. You can also save the responses to do a similar analysis.
+
+![](images/stats-saving-medium.gif)
+
+If you don't do this, you can still go to the next step and use the provided data!
+
+## Retrieving Statistics
+
+* Open up a Jupyter Notebook or Python terminal in the `medium/` directory
+and run
+
+```
+from retrieval import get_data
+df = get_data(fname='stats.html')
+```
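
+The result is a dataframe with one row per article; among its columns are
+`views`, `reads`, `read_ratio`, `claps`, `word_count`, and `publication`
+(see `retrieval.py` for the full set).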
+
+## Analysis and Visualization
+
+* Interactive plots are not rendered on GitHub. To view the plots with their full
+capability, use NBviewer ([`Medium Stats Analysis` on NBviewer](https://nbviewer.jupyter.org/github/WillKoehrsen/Data-Analysis/blob/master/medium/Medium%20Stats%20Analysis.ipynb))
+* All plots can be opened in the plotly online editor to polish them for publication
+
+
+* __Histogram__: `make_hist(df, x, category=None)`
+* __Cumulative plot__: `make_cum_plot(df, y, category=None, ranges=False)`
+* __Scatter plots__: `make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None, ranges=False, title_override=None)`
+* __Scatter plot with three variables__: pass in `category` or `scale` to `make_scatter_plot`
+* __Univariate Linear Regression__: `make_linear_regression(df, x, y, intercept_0)`
+* __Univariate polynomial fitting__: `make_poly_fits(df, x, y, degree=6)`
+* __Multivariate Linear Regression__: pass in list of `x` to `make_linear_regression`
+* __Future extrapolation__: `make_extrapolation(df, y, years, degree=4)`
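
+A minimal sketch of how these fit together (assuming `df` comes from `get_data`
+above; `iplot` is plotly's standard offline display function):

+```
+from plotly.offline import iplot
+from visuals import make_hist, make_scatter_plot
+# Distribution of the read ratio, segmented by publication
+iplot(make_hist(df, x='read_ratio', category='publication'))
+# Views against word count, with markers sized and colored by read time
+iplot(make_scatter_plot(df, x='word_count', y='views', scale='read_time'))
+```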
+
+
+* More methods will be coming soon!
+* Submit pull requests with your own code, or open issues for suggestions!
+
+

+ 226 - 0
medium/retrieval.py

@@ -0,0 +1,226 @@
+from multiprocessing import Pool
+import requests
+import re
+from bs4 import BeautifulSoup
+from itertools import chain
+from collections import Counter
+from timeit import default_timer as timer
+import pandas as pd
+from datetime import datetime
+
+
+def get_table_rows(fname='stats.html'):
+    """
+    Extract the table rows from the statistics
+
+    :param fname: string name of the file stored in `data` directory
+
+    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
+    """
+
+    soup = BeautifulSoup(open(f'data/{fname}', 'r'), features='lxml')
+    table_rows = soup.find_all(
+        attrs={'class': "sortableTable-row js-statsTableRow"})
+    print(f'Found {len(table_rows)} entries in table.')
+    return table_rows
+
+
+def convert_timestamp(ts: int, tz: str):
+    """Convert a unix timestamp to a date timestamp"""
+    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz).tz_localize(None)
+
+
+def process_entry(entry, parallel=True, tz='America/Chicago'):
+    """
+    Extract data from one entry in table
+
+    :param entry: BeautifulSoup tag
+    :param parallel: Boolean for whether function is being run in parallel
+    :param tz: string representing timezone for started and published time
+
+    :return entry_dict: dictionary with data about entry
+
+    """
+    # Convert to soup when running in parallel
+    if parallel:
+        entry = BeautifulSoup(entry, features='lxml').body.tr
+
+    entry_dict = {}
+    # Extract information
+    for value, key in zip(entry.find_all(attrs={'class': 'sortableTable-value'}),
+                          ['published_date', 'views', 'reads', 'ratio', 'fans']):
+        entry_dict[key] = float(
+            value.text) if key == 'ratio' else int(value.text)
+
+    entry_dict['read_time'] = int(entry.find_all(attrs={'class': 'readingTime'})[
+                                  0].get('title').split(' ')[0])
+
+    # Unlisted vs published
+    entry_dict['type'] = 'unlisted' if len(
+        entry.find_all(text=' Unlisted')) > 0 else 'published'
+
+    # Publication
+    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
+    if 'In' in publication[0].text:
+        entry_dict['publication'] = publication[0].text.split('In ')[
+            1].split('View')[0]
+    else:
+        entry_dict['publication'] = 'None'
+
+    # Convert datetimes
+    entry_dict['published_date'] = convert_timestamp(
+        entry_dict['published_date'], tz=tz)
+    entry_dict['started_date'] = convert_timestamp(
+        entry.get('data-timestamp'), tz=tz)
+
+    # Get the link
+    link = entry.find_all(text='View story',
+                          attrs={'class': 'sortableTable-link'})[0].get('href')
+    entry_dict['link'] = link
+    # Retrieve the article and create a soup
+    entry = requests.get(link).content
+    entry_soup = BeautifulSoup(entry, features='lxml')
+
+    # Get the title
+    try:
+        title = entry_soup.h1.text
+    except AttributeError:
+        # Responses have no <h1> title
+        title = 'response'
+
+    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
+
+    # Main text entries
+    entry_text = [p.text for p in entry_soup.find_all(
+        ['h1', 'h2', 'h3', 'p', 'blockquote'])]
+
+    # Make sure to catch everything
+    entry_text.extend(s.text for s in entry_soup.find_all(
+        attrs={'class': 'graf graf--li graf-after--li'}))
+    entry_text.extend(s.text for s in entry_soup.find_all(
+        attrs={'class': 'graf graf--li graf-after--p'}))
+    entry_text.extend(s.text for s in entry_soup.find_all(
+        attrs={'class': 'graf graf--li graf-after--blockquote'}))
+    entry_text.extend(s.text for s in entry_soup.find_all(
+        attrs={'class': 'graf graf--li graf-after--pullquote'}))
+
+    entry_text = ' '.join(entry_text)
+
+    # Word count
+    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))
+
+    # Number of claps
+    clap_pattern = re.compile(
+        r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
+    claps = entry_soup.find_all(text=clap_pattern)
+
+    if len(claps) > 0:
+        if 'K' in claps[0]:
+            clap_number = int(1e3 * float(claps[0].split('K')[0]))
+        else:
+            clap_number = int(claps[0].split(' ')[0])
+    else:
+        clap_number = 0
+
+    # Post tags
+    tags = entry_soup.find_all(
+        attrs={'class': 'tags tags--postTags tags--borderless'})
+    tags = [li.text for li in tags[0].find_all('li')]
+
+    # Responses to entry
+    responses = entry_soup.find_all(attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
+                                           'data-action': 'scroll-to-responses'})
+    num_responses = int(responses[0].text) if len(responses) > 0 else 0
+
+    # Store in dictionary
+    entry_dict['title'] = title
+    entry_dict['title_word_count'] = title_word_count
+    entry_dict['text'] = entry_text
+    entry_dict['word_count'] = word_count
+    entry_dict['claps'] = clap_number
+    entry_dict['tags'] = tags
+    entry_dict['num_responses'] = num_responses
+
+    # Time since publication
+    entry_dict['days_since_publication'] = (
+        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)
+
+    return entry_dict
+
+
+def process_in_parallel(table_rows, processes=20):
+    """
+    Process all the stats in a table in parallel
+
+    :note: make sure to set the correct time zone in `process_entry`
+    :note: running on Mac may first require setting
+    export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
+    from the command line to enable parallel processing
+
+    :param table_rows: BeautifulSoup table rows
+
+    :param processes: integer number of worker processes to use in parallel
+
+    :return df: dataframe of information about each post
+
+    """
+    # Convert to strings for multiprocessing
+    table_rows_str = [str(r) for r in table_rows]
+
+    # Process each article in parallel
+    pool = Pool(processes=processes)
+    results = []
+    start = timer()
+    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
+        # Report progress
+        print(f'{100 * i / len(table_rows_str):.2f}% complete.', end='\r')
+        results.append(r)
+    pool.close()
+    pool.join()
+    end = timer()
+    print(
+        f'Processed {len(table_rows_str)} articles in {end-start:.2f} seconds.')
+
+    # Convert to dataframe
+    df = pd.DataFrame(results)
+    # Rename ratio
+    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)
+    # Add extra columns with more data
+    df['claps_per_word'] = df['claps'] / df['word_count']
+    df['editing_days'] = ((df['published_date'] - df['started_date']
+                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)
+
+    # Rounding
+    df['published_date'] = df['published_date'].dt.round('min')
+    df['started_date'] = df['started_date'].dt.round('min')
+    df['read_ratio'] = df['read_ratio'].round(2)
+
+    # 5 most common tags (might want to include more tags)
+    n = 5
+    all_tags = list(chain(*df['tags'].tolist()))
+    tag_counts = Counter(all_tags)
+    tags = tag_counts.most_common(n)
+
+    # Adding columns with indication of tag
+    for tag, count in tags:
+        flag = [1 if tag in tags else 0 for tags in df['tags']]
+        df.loc[:, f'<tag>{tag}'] = flag
+
+    df.sort_values('published_date', inplace=True)
+    return df
+
+
+def get_data(fname='stats.html', processes=20):
+    """
+    Retrieve medium article statistics
+
+    :note: running on Mac may first require setting
+    export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
+    from the command line to enable parallel processing
+
+    :param fname: file name (should be 'stats.html')
+    :param processes: integer number of processes
+
+    :return df: dataframe of article data
+    """
+    t = get_table_rows(fname=fname)
+    return process_in_parallel(t, processes=processes)
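

+if __name__ == '__main__':
+    # Usage sketch: assumes `data/stats.html` has been saved as described in the readme
+    df = get_data(fname='stats.html')
+    print(df[['title', 'views', 'reads', 'read_ratio']].head())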

+ 38 - 0
medium/view_extraction.py

@@ -0,0 +1,38 @@
+import os
+from bs4 import BeautifulSoup
+from dateutil import parser
+from datetime import datetime
+import pandas as pd
+
+
+def process_bargraph(bargraph, i):
+    # Sort the bars by x position so they are in date order
+    bars = sorted(bargraph.find_all(attrs={'class': 'bargraph-bar'}),
+                  key=lambda bar: float(bar.get('x')))
+    bardata = [bar.get('data-tooltip') for bar in bars]
+    views = [float(s.split(' ')[0].replace(',', '')) for s in bardata]
+    dates = [s.split(' ')[-1].split('\xa0')[0] + ' '
+             + s.split(' ')[-1].split('\xa0')[1] for s in bardata]
+    # Infer the year from how many months back this page of stats is
+    year = str((datetime.now() - pd.Timedelta(days=i * 30)).year)
+    dates = [parser.parse(d + ' ' + year) for d in dates]
+    return views, dates
+
+
+files = os.listdir('html_pages')
+
+v = []
+d = []
+
+for fid in files:
+    # The page index is encoded in the file name, e.g. 'p3.html' -> 3
+    i = int(fid.split('.')[0].split('p')[1])
+    graph = BeautifulSoup(open(f'html_pages/{fid}', 'r'),
+                          features='lxml').find_all(attrs={'class': 'bargraph'})[0]
+    views, dates = process_bargraph(graph, i)
+    v.extend(views)
+    d.extend(dates)

+results = pd.DataFrame({'date': d, 'views': v})
+
+results['date'] = pd.to_datetime(results['date'])
+results.to_parquet('medium_views_time')

+ 390 - 0
medium/visuals.py

@@ -0,0 +1,390 @@
+import pandas as pd
+import numpy as np
+import statsmodels.api as sm
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error
+
+
+from scipy import stats
+
+import plotly.graph_objs as go
+import cufflinks
+cufflinks.go_offline()
+
+
+def make_hist(df, x, category=None):
+    """
+    Make an interactive histogram, optionally segmented by `category`
+
+    :param df: dataframe of data
+    :param x: string of column to use for plotting
+    :param category: string representing column to segment by
+
+    :return figure: a plotly histogram to show with iplot or plot
+    """
+    if category is not None:
+        data = []
+        for name, group in df.groupby(category):
+            data.append(go.Histogram(dict(x=group[x], name=name)))
+    else:
+        data = [go.Histogram(dict(x=df[x]))]
+
+    layout = go.Layout(
+        yaxis=dict(title="Count"),
+        xaxis=dict(title=x.replace('_', ' ').title()),
+        title=f"{x.replace('_', ' ').title()} Distribution by {category.replace('_', ' ').title()}"
+        if category
+        else f"{x.replace('_', ' ').title()} Distribution",
+    )
+
+    figure = go.Figure(data=data, layout=layout)
+    return figure
+
+
+def make_cum_plot(df, y, category=None, ranges=False):
+    """
+    Make an interactive cumulative plot, optionally segmented by `category`
+
+    :param df: dataframe of data, must have a `published_date` column
+    :param y: string of column to use for plotting or list of two strings for double y axis
+    :param category: string representing column to segment by
+    :param ranges: boolean for whether to add range slider and range selector
+
+    :return figure: a plotly plot to show with iplot or plot
+    """
+    if category is not None:
+        data = []
+        for i, (name, group) in enumerate(df.groupby(category)):
+            group.sort_values("published_date", inplace=True)
+            data.append(
+                go.Scatter(
+                    x=group["published_date"],
+                    y=group[y].cumsum(),
+                    mode="lines+markers",
+                    text=group["title"],
+                    name=name,
+                    marker=dict(size=10, opacity=0.8,
+                                symbol=i + 2),
+                )
+            )
+    else:
+        df.sort_values("published_date", inplace=True)
+        if isinstance(y, list) and len(y) == 2:
+            data = [
+                go.Scatter(
+                    x=df["published_date"],
+                    y=df[y[0]].cumsum(),
+                    name=y[0].title(),
+                    mode="lines+markers",
+                    text=df["title"],
+                    marker=dict(size=10, color='blue', opacity=0.6, line=dict(color='black'),
+                                )),
+                go.Scatter(
+                    x=df["published_date"],
+                    y=df[y[1]].cumsum(),
+                    yaxis='y2',
+                    name=y[1].title(),
+                    mode="lines+markers",
+                    text=df["title"],
+                    marker=dict(size=10, color='red', opacity=0.6, line=dict(color='black'),
+                                )),
+            ]
+        else:
+            data = [
+                go.Scatter(
+                    x=df["published_date"],
+                    y=df[y].cumsum(),
+                    mode="lines+markers",
+                    text=df["title"],
+                    marker=dict(size=12, color='blue', opacity=0.6, line=dict(color='black'),
+                                ),
+                )
+            ]
+    if isinstance(y, list) and len(y) == 2:
+        layout = go.Layout(
+            xaxis=dict(title="Published Date", type="date"),
+            yaxis=dict(title=y[0].replace('_', ' ').title(), color='blue'),
+            yaxis2=dict(title=y[1].replace('_', ' ').title(), color='red',
+                        overlaying='y', side='right'),
+            font=dict(size=14),
+            title=f"Cumulative {y[0].title()} and {y[1].title()}",
+        )
+    else:
+        layout = go.Layout(
+            xaxis=dict(title="Published Date", type="date"),
+            yaxis=dict(title=y.replace('_', ' ').title()),
+            font=dict(size=14),
+            title=f"Cumulative {y.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
+            if category is not None
+            else f"Cumulative {y.replace('_', ' ').title()}",
+        )
+
+    # Add a rangeselector and rangeslider for a date xaxis
+    if ranges:
+        rangeselector = dict(
+            buttons=list(
+                [
+                    dict(count=1, label="1m", step="month", stepmode="backward"),
+                    dict(count=6, label="6m", step="month", stepmode="backward"),
+                    dict(count=1, label="1y", step="year", stepmode="backward"),
+                    dict(step="all"),
+                ]
+            )
+        )
+        rangeslider = dict(visible=True)
+        layout["xaxis"]["rangeselector"] = rangeselector
+        layout["xaxis"]["rangeslider"] = rangeslider
+        layout['width'] = 1000
+        layout['height'] = 600
+
+    figure = go.Figure(data=data, layout=layout)
+    return figure
+
+
+def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None, ranges=False, title_override=None):
+    """
+    Make an interactive scatterplot, optionally segmented by `category`
+
+    :param df: dataframe of data
+    :param x: string of column to use for xaxis
+    :param y: string of column to use for yaxis
+    :param fits: list of strings of fits
+    :param xlog: boolean for making a log xaxis
+    :param ylog: boolean for making a log yaxis
+    :param category: string representing categorical column to segment by, this must be a categorical
+    :param scale: string representing numerical column to size and color markers by, this must be numerical data
+    :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
+    :param annotations: text to display on the plot (dictionary)
+    :param ranges: boolean for whether to add a range slider and selector
+    :param title_override: String to override the title
+
+    :return figure: a plotly plot to show with iplot or plot
+    """
+    if category is not None:
+        title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
+        data = []
+        for i, (name, group) in enumerate(df.groupby(category)):
+            data.append(go.Scatter(x=group[x],
+                                   y=group[y],
+                                   mode='markers',
+                                   text=group['title'],
+                                   name=name,
+                                   marker=dict(size=8, symbol=i + 2)))
+
+    else:
+        if scale is not None:
+            title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} Scaled by {scale.title()}"
+            data = [go.Scatter(x=df[x],
+                               y=df[y],
+                               mode='markers',
+                               text=df['title'], marker=dict(size=df[scale],
+                                                             line=dict(color='black', width=0.5), sizemode='area', sizeref=sizeref, opacity=0.8,
+                                                             colorscale='Viridis', color=df[scale], showscale=True, sizemin=2))]
+        else:
+
+            df.sort_values(x, inplace=True)
+            title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()}"
+            data = [go.Scatter(x=df[x],
+                               y=df[y],
+                               mode='markers',
+                               text=df['title'], marker=dict(
+                size=12, color='blue', opacity=0.8, line=dict(color='black')),
+                name='observations')]
+            if fits is not None:
+                for fit in fits:
+                    data.append(go.Scatter(x=df[x], y=df[fit], text=df['title'],
+                                           mode='lines+markers', marker=dict
+                                           (size=8, opacity=0.6),
+                                           line=dict(dash='dash'), name=fit))
+
+                title += ' with Fit'
+    layout = go.Layout(annotations=annotations,
+                       xaxis=dict(title=x.replace('_', ' ').title() + (' (log scale)' if xlog else ''),
+                                  type='log' if xlog else None),
+                       yaxis=dict(title=y.replace('_', ' ').title() + (' (log scale)' if ylog else ''),
+                                  type='log' if ylog else None),
+                       font=dict(size=14),
+                       title=title if title_override is None else title_override,
+                       )
+
+    # Add a rangeselector and rangeslider for a date xaxis
+    if ranges:
+        rangeselector = dict(
+            buttons=list(
+                [
+                    dict(count=1, label="1m", step="month", stepmode="backward"),
+                    dict(count=6, label="6m", step="month", stepmode="backward"),
+                    dict(count=1, label="1y", step="year", stepmode="backward"),
+                    dict(step="all"),
+                ]
+            )
+        )
+        rangeslider = dict(visible=True)
+        layout["xaxis"]["rangeselector"] = rangeselector
+        layout["xaxis"]["rangeslider"] = rangeslider
+        layout['width'] = 1000
+        layout['height'] = 600
+
+    figure = go.Figure(data=data, layout=layout)
+    return figure
+
+
+def make_linear_regression(df, x, y, intercept_0):
+    """
+    Create a linear regression, either with the intercept set to 0 or
+    the intercept allowed to be fitted
+
+    :param df: dataframe with data
+    :param x: string or list of strings for the name of the column with x data
+    :param y: string for the name of the column with y data
+    :param intercept_0: boolean indicating whether to set the intercept to 0
+    """
+    if isinstance(x, list):
+        lin_model = LinearRegression()
+        lin_model.fit(df[x], df[y])
+
+        slopes, intercept = lin_model.coef_, lin_model.intercept_
+        df['predicted'] = lin_model.predict(df[x])
+        r2 = lin_model.score(df[x], df[y])
+        rmse = np.sqrt(mean_squared_error(
+            y_true=df[y], y_pred=df['predicted']))
+        equation = f'{y.replace("_", " ")} ='
+
+        names = ['r2', 'rmse', 'intercept']
+        values = [r2, rmse, intercept]
+        for i, (p, s) in enumerate(zip(x, slopes)):
+            if (i + 1) % 3 == 0:
+                equation += f'<br>{s:.2f} * {p.replace("_", " ")} +'
+            else:
+                equation += f' {s:.2f} * {p.replace("_", " ")} +'
+            names.append(p)
+            values.append(s)
+
+        equation += f' {intercept:.2f}'
+        annotations = [dict(x=0.4 * df.index.max(), y=0.9 * df[y].max(), showarrow=False,
+                            text=equation,
+                            font=dict(size=10))]
+
+        df['index'] = list(df.index)
+        figure = make_scatter_plot(df, x='index', y=y, fits=[
+                                   'predicted'], annotations=annotations)
+        summary = pd.DataFrame({'name': names, 'value': values})
+    else:
+        if intercept_0:
+            lin_reg = sm.OLS(df[y], df[x]).fit()
+            df['fit_values'] = lin_reg.fittedvalues
+            summary = lin_reg.summary()
+            slope = float(lin_reg.params)
+            equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')}$"
+
+        else:
+            lin_reg = stats.linregress(df[x], df[y])
+            intercept, slope = lin_reg.intercept, lin_reg.slope
+            params = ['pvalue', 'rvalue', 'slope', 'intercept']
+            values = []
+            for p in params:
+                values.append(getattr(lin_reg, p))
+            summary = pd.DataFrame({'param': params, 'value': values})
+            df['fit_values'] = df[x] * slope + intercept
+            equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
+
+        annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
+                            text=equation,
+                            font=dict(size=32))]
+        figure = make_scatter_plot(
+            df, x=x, y=y, fits=['fit_values'], annotations=annotations)
+    return figure, summary
+
+
+def make_poly_fits(df, x, y, degree=6):
+    """
+    Generate fits and make interactive plot with fits
+
+    :param df: dataframe with data
+    :param x: string representing x data column
+    :param y: string representing y data column
+    :param degree: integer degree of fits to go up to
+
+    :return fit_stats: dataframe with information about fits
+    :return figure: interactive plotly figure that can be shown with iplot or plot
+    """
+
+    # Don't want to alter original data frame
+    df = df.copy()
+    fit_list = []
+    rmse = []
+    fit_params = []
+
+    # Make each fit
+    for i in range(1, degree + 1):
+        fit_name = f'fit degree = {i}'
+        fit_list.append(fit_name)
+        z, res, *rest = np.polyfit(df[x], df[y], i, full=True)
+        fit_params.append(z)
+        df.loc[:, fit_name] = np.poly1d(z)(df[x])
+        rmse.append(np.sqrt(res[0]))
+
+    fit_stats = pd.DataFrame(
+        {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
+    figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
+    return figure, fit_stats
+
+
+def make_extrapolation(df, y, years, degree=4):
+    """
+    Extrapolate `y` a given number of `years` into the future with a polynomial fit of degree `degree`
+
+    :param df: dataframe of data
+    :param y: string of column to extrapolate
+    :param years: number of years to extrapolate into the future
+    :param degree: integer degree of polynomial fit
+
+    :return figure: plotly figure for display using iplot or plot
+    :return future_df: extrapolated numbers into the future
+    """
+
+    df = df.copy()
+    x = 'days_since_start'
+    df['days_since_start'] = (
+        (df['published_date'] - df['published_date'].min()).
+        dt.total_seconds() / (3600 * 24)).astype(int)
+
+    cumy = f'cum_{y}'
+    df[cumy] = df.sort_values(x)[y].cumsum()
+
+    figure, summary = make_poly_fits(df, x, cumy, degree=degree)
+
+    min_date = df['published_date'].min()
+    max_date = df['published_date'].max()
+
+    date_range = pd.date_range(start=min_date,
+                               end=max_date + pd.Timedelta(days=int(years * 365)))
+
+    future_df = pd.DataFrame({'date': date_range})
+
+    future_df[x] = (
+        (future_df['date'] - future_df['date'].min()).
+        dt.total_seconds() / (3600 * 24)).astype(int)
+
+    newcumy = f'cumulative_{y}'
+
+    future_df = future_df.merge(df[[x, cumy]], on=x, how='left').\
+        rename(columns={cumy: newcumy})
+
+    z = np.poly1d(summary.iloc[-1]['params'])
+    pred_name = f'predicted_{y}'
+    future_df[pred_name] = z(future_df[x])
+    future_df['title'] = ''
+
+    last_date = future_df.loc[future_df['date'].idxmax()]
+    prediction_text = (
+        f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}.")
+    annotations = [dict(x=future_df['date'].quantile(0.4),
+                        y=0.8 * future_df[pred_name].max(), text=prediction_text, showarrow=False,
+                        font=dict(size=16))]
+
+    title_override = f'{y.replace("_", " ").title()} with Extrapolation {years} Years into the Future'
+
+    figure = make_scatter_plot(future_df, 'date', newcumy, fits=[
+                               pred_name], annotations=annotations, ranges=True, title_override=title_override)
+    return figure, future_df

File diff suppressed because it is too large
+ 40170 - 0
plotly/Plotly Whirlwind Introduction.ipynb


File diff suppressed because it is too large
+ 33831 - 3
plotly/building_one.csv


+ 124 - 0
plotly/data/blue_jays.csv

@@ -0,0 +1,124 @@
+"","BirdID","KnownSex","BillDepth","BillWidth","BillLength","Head","Mass","Skull","Sex"
+"1","0000-00000","M",8.26,9.21,25.92,56.58,73.3,30.66,1
+"2","1142-05901","M",8.54,8.76,24.99,56.36,75.1,31.38,1
+"3","1142-05905","M",8.39,8.78,26.07,57.32,70.25,31.25,1
+"4","1142-05907","F",7.78,9.3,23.48,53.77,65.5,30.29,0
+"5","1142-05909","M",8.71,9.84,25.47,57.32,74.9,31.85,1
+"6","1142-05911","F",7.28,9.3,22.25,52.25,63.9,30,0
+"7","1142-05912","M",8.74,9.28,25.35,57.12,75.1,31.77,1
+"8","1142-05914","M",8.72,9.94,30,60.67,78.1,30.67,1
+"9","1142-05917","F",8.2,9.01,22.78,52.83,64,30.05,0
+"10","1142-05920","F",7.67,9.31,24.61,54.94,67.33,30.33,0
+"11","1142-05930","M",8.78,8.83,25.72,56.54,76.4,30.82,1
+"12","1142-05941","F",8.15,8.67,24.66,54.69,71.5,30.03,0
+"13","1142-05957","M",8.62,9.28,24.5,56.48,78.2,31.98,1
+"14","1142-05971","F",7.65,9.11,23.93,55.75,73.4,31.83,0
+"15","1142-05981","M",7.96,9.31,25.62,57.34,77,31.73,1
+"16","1142-05986","F",8.17,8.49,23.15,53.05,65,29.9,0
+"17","1142-05990","F",8.13,8.88,25,54.81,75.2,29.81,0
+"18","1142-05991","M",8.19,9.98,25.4,58.7,75.7,33.3,1
+"19","1142-05995","M",8.49,8.81,25.93,57.07,78.4,31.15,1
+"20","1142-05996","M",8.35,9.57,24.4,55.79,70.6,31.39,1
+"21","1142-05997","M",8.53,9.71,26.3,58.31,74.7,32,1
+"22","1142-05998","M",8.07,8.38,23.62,54.51,70,30.89,1
+"23","1142-05999","M",8.23,10.31,26.66,58.21,75.4,31.55,1
+"24","702-90556","F",7.86,9.21,23.87,54.09,75.2,30.22,0
+"25","702-90560","M",8.42,9.3,24.98,56.8,74.5,31.83,1
+"26","702-90567","F",8.48,9.64,25.4,56.82,62.2,31.42,0
+"27","702-90576","F",8.22,9.09,23,55.39,72.3,32.39,0
+"28","702-90577","F",8.18,9.44,23.44,54.21,74,30.77,0
+"29","702-90578","M",8.23,9.67,24.86,56.88,73.73,32.02,1
+"30","702-90583","F",8.44,9.19,26.14,56.37,73.97,30.24,0
+"31","872-94671","F",7.21,8.21,23.13,53.7,66.8,30.57,0
+"32","872-94673","M",8.88,9.58,26.19,55.96,75.18,29.77,1
+"33","872-94684","F",8.58,10.63,24.75,55.83,78.3,31.08,0
+"34","872-94688","F",8.36,9.63,24.92,55.47,70.6,30.55,0
+"35","872-94689","M",8.2,8.68,24.24,55.96,72.1,31.72,1
+"36","872-94692","M",8.42,10.24,25.4,56.59,70.2,31.19,1
+"37","872-94694","M",8.6,8.4,25.5,57.5,74,32,1
+"38","872-94698","M",7.96,8.62,24.55,56.01,69.67,31.46,1
+"39","872-94709","F",7.58,10,23.42,54.49,69.4,31.07,0
+"40","872-94716","F",8.05,9.35,24.46,56.04,77.6,31.58,0
+"41","872-94731","M",8.34,8.62,23.92,53.74,60.17,29.82,1
+"42","872-94737","F",7.69,9.01,23.31,54.13,70.5,30.82,0
+"43","872-94757","M",8.88,9.22,25.68,56.8,67.5,31.12,1
+"44","872-94761","M",8.55,9.03,26.55,57.86,75.95,31.31,1
+"45","872-94766","F",8.42,9.49,26.3,56.5,79.5,30.2,0
+"46","872-94769","M",8.54,8.96,25.73,56.82,77.5,31.09,1
+"47","872-94771","M",8.54,9.4,24.29,56.35,79.2,32.06,1
+"48","872-94776","F",7.57,9.32,23.48,54.31,65.3,30.83,0
+"49","872-94777","M",8.4,8.2,25.7,56.4,69,30.7,1
+"50","872-94779","F",8.11,8.37,24.07,53.04,65.6,28.97,0
+"51","872-94780","F",8.19,9.38,24.93,55.58,67.83,30.64,0
+"52","952-00002","F",8.46,9.69,25.3,56.84,75.7,31.54,0
+"53","952-00004","M",8.58,9.63,26.12,57,72.6,30.88,1
+"54","952-00006","F",7.8,8.74,23.89,53.74,58.8,29.85,0
+"55","952-00007","F",8.1,8.44,23.75,55.74,70.9,32,0
+"56","952-00012","F",8.7,8.91,25.46,55.37,66.3,29.91,0
+"57","952-00013","M",8.82,8.6,25.46,56.86,73.9,31.4,1
+"58","952-00016","M",8.39,9.12,26.58,57.64,71.35,31.06,1
+"59","952-00020","M",7.89,9.07,26.12,57.86,73.9,31.74,1
+"60","952-00023","M",7.93,8.72,24.93,55.02,67.2,30.09,1
+"61","952-00026","M",8.86,10.02,25.04,57.3,73,32.26,1
+"62","952-00056","M",9,9.4,25.5,56.9,74,31.4,1
+"63","952-00057","F",8,8.9,23.6,52.8,63.7,29.2,0
+"64","952-00058","F",8.2,9,25,54.2,72.8,29.2,0
+"65","952-00059","M",8.7,9.8,25.6,57.3,76.3,31.7,1
+"66","952-00062","F",7.8,8,23.1,53.9,66,30.8,0
+"67","952-00063","M",8.2,9.9,24.4,56.7,74,32.3,1
+"68","952-00064","M",8.2,9.5,24,56.3,76.3,32.3,1
+"69","952-00065","M",8.6,9.6,23.7,55.5,71.3,31.8,1
+"70","952-00066","F",7.3,9.9,22.4,53.9,65,31.5,0
+"71","952-00068","M",8.4,9.4,25,56.1,73.8,31.1,1
+"72","952-00069","F",8,9.3,23.4,54,69,30.6,0
+"73","952-00070","M",8.3,9.2,24.3,54.9,72.1,30.6,1
+"74","952-00071","M",8.6,9.8,26,59.2,80.9,33.2,1
+"75","952-00072","F",8.3,9,25.7,55,68.5,29.3,0
+"76","952-00073","F",8.2,9.7,24.6,54.4,70,29.8,0
+"77","952-00074","M",8.5,9.6,25.5,56.7,75.9,31.2,1
+"78","952-00076","M",8.2,9.3,24.9,55.5,70,30.6,1
+"79","952-00077","F",8.3,8.2,23.7,53.6,69,29.9,0
+"80","952-00078","M",8.8,9.7,25.3,56.6,75.8,31.3,1
+"81","952-00079","M",8.2,8.1,25.9,57.5,69.7,31.6,1
+"82","952-00080","M",8.8,9.8,27.3,56.2,65.5,28.9,1
+"83","952-00081","F",7.6,8.3,24.8,56.2,67.9,31.4,0
+"84","952-00084","F",8.8,9.7,25.5,56.7,81.5,31.2,0
+"85","962-62003","M",8.56,9.2,26.62,56.48,74,29.86,1
+"86","962-62006","M",8.92,9.22,24.78,56.45,71.5,31.67,1
+"87","962-62007","F",7.74,9.27,25.05,55.76,66.1,30.71,0
+"88","962-62008","F",7.91,9.38,25.73,55.7,69.8,29.97,0
+"89","962-62019","M",8.72,9.25,24.3,54.76,68.5,30.46,1
+"90","962-62021","F",7.55,8.62,23.55,52.74,70,29.19,0
+"91","962-62024","M",8.43,9.27,26.32,56.44,66.75,30.12,1
+"92","962-62025","M",8.2,9.28,26.27,58.5,76.5,32.23,1
+"93","962-62026","F",7.62,9.88,25.71,55.97,69.3,30.26,0
+"94","962-62027","M",8.51,9.64,25.91,55.64,67.8,29.73,1
+"95","962-62030","F",8.21,9.21,23.6,53.88,75.5,30.28,0
+"96","962-62031","F",7.86,9.39,23.36,54.04,73.25,30.69,0
+"97","962-62038","M",8.58,9.73,27.04,57.56,71.9,30.52,1
+"98","962-62040","M",8.59,9.97,25.88,56,79.55,30.12,1
+"99","962-62041","F",7.72,8.97,22.88,53.03,66.75,30.15,0
+"100","962-62043","F",7.58,9.04,24.02,55.65,73.2,31.63,0
+"101","962-62045","F",8.4,8.4,23.9,54.7,72.35,30.8,0
+"102","962-62046","F",7.9,8.48,22.78,51.6,64,28.82,0
+"103","962-62063","M",8.12,9.67,24.9,55.91,73.5,31.01,1
+"104","962-62067","F",7.92,9.16,23.92,54.69,68.9,30.77,0
+"105","962-62068","F",7.96,9.09,24.33,54.58,73.8,30.25,0
+"106","962-62069","M",8.1,9.1,25.9,57.1,77.4,31.2,1
+"107","962-62070","F",7.5,9.57,24.6,53.8,61.4,29.2,0
+"108","962-62081","M",8.15,9.72,26.05,56.85,73.4,30.8,1
+"109","962-62088","F",7.71,8.58,25.06,54.15,61.2,29.09,0
+"110","962-62089","F",8.05,9.25,25.2,56.25,69.65,31.05,0
+"111","962-62090","F",8.57,8.97,25.15,54.8,77,29.64,0
+"112","962-62099","F",8.69,9.43,24.94,56.09,72.2,31.15,0
+"113","962-62104","F",8.02,8.62,24.09,55.82,76.55,31.73,0
+"114","962-62115","F",8.57,10.82,23.68,53.95,68.25,30.27,0
+"115","962-62117","M",8.33,9.27,25.78,56.71,71.5,30.93,1
+"116","962-62123","M",8.43,9.23,25.28,57.25,77.53,31.97,1
+"117","962-62127","M",8.2,9.06,24.22,54.58,68.95,30.36,1
+"118","962-62138","F",8.3,9.28,23.92,56.28,78.8,32.36,0
+"119","962-62176","M",8.7,9.12,24.62,56.61,77,31.99,1
+"120","962-62181","M",7.96,9.8,25.07,55.68,68,30.61,1
+"121","962-62184","F",7.9,9.3,23.6,53.9,63.9,30.3,0
+"122","962-62185","F",7.63,8.56,24.29,54.19,70.45,29.9,0
+"123","962-62200","F",7.9,8,23,52.7,66,29.7,0

Binary
plotly/data/medium_data_2019_01_06


Binary
plotly/images/Screen Recording 2019-01-07 at 11.21.57 CST.mov


Binary
plotly/images/Screen Recording 2019-01-07 at 12.27.50 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.16.50 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.18.35 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.24.11 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.24.25 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.31.48 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.32.01 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.32.09 CST.mov


Binary
plotly/images/Screen Recording 2019-01-08 at 20.40.18 CST.mov


Binary
plotly/images/Screen Shot 2019-01-07 at 13.49.44 CST.png


Binary
plotly/images/Screen Shot 2019-01-07 at 15.59.48 CST.png


Binary
plotly/images/Screen Shot 2019-01-07 at 16.04.22 CST.png


+ 0 - 0
plotly/images/Screen Shot 2019-01-07 at 16.34.10 CST.png


Some files were not shown because too many files have changed in this diff