6 anos atrás · 79125c2547
--- a/Analysis.ipynb
+++ b/Analysis.ipynb
--- a/medium/retrieval.py
+++ b/medium/retrieval.py
@@ -89,6 +89,8 @@ def process_entry(entry, parallel=True, tz='America/Chicago'):
 
				     except:
			
 
				         title = 'response'
			
 
				 
			
 
				+    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
			
 
				+
			
 
				     # Main text entries
			
 
				     entry_text = [p.text for p in entry_soup.find_all(
			
 
				         ['h1', 'h2', 'h3', 'p', 'blockquote'])]
			
@@ -133,6 +135,7 @@ def process_entry(entry, parallel=True, tz='America/Chicago'):
 
				 
			
 
				     # Store in dictionary
			
 
				     entry_dict['title'] = title
			
 
				+    entry_dict['title_word_count'] = title_word_count
			
 
				     entry_dict['text'] = entry_text
			
 
				     entry_dict['word_count'] = word_count
			
 
				     entry_dict['claps'] = clap_number
			
--- a/medium/visuals.py
+++ b/medium/visuals.py
@@ -1,24 +1,16 @@
 
				 # Data science imports
			
 
				-from multiprocessing import Pool
			
 
				-import requests
			
 
				-import re
			
 
				-from bs4 import BeautifulSoup
			
 
				-from itertools import chain
			
 
				-from collections import Counter, defaultdict
			
 
				-from timeit import default_timer as timer
			
 
				 import pandas as pd
			
 
				 import numpy as np
			
 
				 import statsmodels.api as sm
			
 
				+from sklearn.linear_model import LinearRegression
			
 
				+from sklearn.metrics import mean_squared_error
			
 
				 
			
 
				 
			
 
				 from scipy import stats
			
 
				 
			
 
				 # Interactive plotting
			
 
				-import plotly.plotly as py
			
 
				 import plotly.graph_objs as go
			
 
				-from plotly.offline import iplot
			
 
				 import cufflinks
			
 
				-
			
 
				 cufflinks.go_offline()
			
 
				 
			
 
				 
			
@@ -108,13 +100,14 @@ def make_hist(df, x, category=None):
 
				     return figure
			
 
				 
			
 
				 
			
 
				-def make_cum_plot(df, y, category=None):
			
 
				+def make_cum_plot(df, y, category=None, ranges=False):
			
 
				     """
			
 
				     Make an interactive cumulative plot, optionally segmented by `category`
			
 
				 
			
 
				     :param df: dataframe of data, must have a `published_date` column
			
 
				     :param y: string of column to use for plotting or list of two strings for double y axis
			
 
				     :param category: string representing column to segment by
			
 
				+    :param ranges: boolean for whether to add range slider and range selector
			
 
				 
			
 
				     :return figure: a plotly plot to show with iplot or plot
			
 
				     """
			
@@ -185,11 +178,29 @@ def make_cum_plot(df, y, category=None):
 
				             else f"Cumulative {y.replace('_', ' ').title()}",
			
 
				         )
			
 
				 
			
 
				+    # Add a rangeselector and rangeslider for a data xaxis
			
 
				+    if ranges:
			
 
				+        rangeselector = dict(
			
 
				+            buttons=list(
			
 
				+                [
			
 
				+                    dict(count=1, label="1m", step="month", stepmode="backward"),
			
 
				+                    dict(count=6, label="6m", step="month", stepmode="backward"),
			
 
				+                    dict(count=1, label="1y", step="year", stepmode="backward"),
			
 
				+                    dict(step="all"),
			
 
				+                ]
			
 
				+            )
			
 
				+        )
			
 
				+        rangeslider = dict(visible=True)
			
 
				+        layout["xaxis"]["rangeselector"] = rangeselector
			
 
				+        layout["xaxis"]["rangeslider"] = rangeslider
			
 
				+        layout['width'] = 1000
			
 
				+        layout['height'] = 600
			
 
				+
			
 
				     figure = go.Figure(data=data, layout=layout)
			
 
				     return figure
			
 
				 
			
 
				 
			
 
				-def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None):
			
 
				+def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None, ranges=False, title_override=None):
			
 
				     """
			
 
				     Make an interactive scatterplot, optionally segmented by `category`
			
 
				 
			
@@ -203,6 +214,8 @@ def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None
 
				     :param scale: string representing numerical column to size and color markers by, this must be numerical data
			
 
				     :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
			
 
				     :param annotations: text to display on the plot (dictionary)
			
 
				+    :param ranges: boolean for whether to add a range slider and selector
			
 
				+    :param title_override: String to override the title
			
 
				 
			
 
				     :return figure: a plotly plot to show with iplot or plot
			
 
				     """
			
@@ -238,8 +251,9 @@ def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None
 
				                 name='observations')]
			
 
				             if fits is not None:
			
 
				                 for fit in fits:
			
 
				-                    data.append(go.Scatter(x=df[x], y=df[fit],
			
 
				-                                           mode='lines+markers', marker=dict(size=8, opacity=0.6),
			
 
				+                    data.append(go.Scatter(x=df[x], y=df[fit], text=df['title'],
			
 
				+                                           mode='lines+markers', marker=dict
			
 
				+                                           (size=8, opacity=0.6),
			
 
				                                            line=dict(dash='dash'), name=fit))
			
 
				 
			
 
				                 title += ' with Fit'
			
@@ -249,9 +263,27 @@ def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None
 
				                        yaxis=dict(title=y.replace('_', ' ').title() + (' (log scale)' if ylog else ''),
			
 
				                                   type='log' if ylog else None),
			
 
				                        font=dict(size=14),
			
 
				-                       title=title,
			
 
				+                       title=title if title_override is None else title_override,
			
 
				                        )
			
 
				 
			
 
				+    # Add a rangeselector and rangeslider for a data xaxis
			
 
				+    if ranges:
			
 
				+        rangeselector = dict(
			
 
				+            buttons=list(
			
 
				+                [
			
 
				+                    dict(count=1, label="1m", step="month", stepmode="backward"),
			
 
				+                    dict(count=6, label="6m", step="month", stepmode="backward"),
			
 
				+                    dict(count=1, label="1y", step="year", stepmode="backward"),
			
 
				+                    dict(step="all"),
			
 
				+                ]
			
 
				+            )
			
 
				+        )
			
 
				+        rangeslider = dict(visible=True)
			
 
				+        layout["xaxis"]["rangeselector"] = rangeselector
			
 
				+        layout["xaxis"]["rangeslider"] = rangeslider
			
 
				+        layout['width'] = 1000
			
 
				+        layout['height'] = 600
			
 
				+
			
 
				     figure = go.Figure(data=data, layout=layout)
			
 
				     return figure
			
 
				 
			
@@ -287,7 +319,7 @@ def make_poly_fits(df, x, y, degree=6):
 
				     fit_stats = pd.DataFrame(
			
 
				         {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
			
 
				     figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
			
 
				-    return fit_stats, figure
			
 
				+    return figure, fit_stats
			
 
				 
			
 
				 
			
 
				 def make_linear_regression(df, x, y, intercept_0):
			
@@ -296,204 +328,122 @@ def make_linear_regression(df, x, y, intercept_0):
 
				     the intercept allowed to be fitted
			
 
				 
			
 
				     :param df: dataframe with data
			
 
				-    :param x: string for the name of the column with x data
			
 
				+    :param x: string or list of stringsfor the name of the column with x data
			
 
				     :param y: string for the name of the column with y data
			
 
				     :param intercept_0: boolean indicating whether to set the intercept to 0
			
 
				     """
			
 
				-
			
 
				-    if intercept_0:
			
 
				-        lin_reg = sm.OLS(df[y], df[x]).fit()
			
 
				-        df['fit_values'] = lin_reg.fittedvalues
			
 
				-        summary = lin_reg.summary()
			
 
				-        slope = float(lin_reg.params)
			
 
				-        equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')}$"
			
 
				-
			
 
				+    if isinstance(x, list):
			
 
				+        lin_model = LinearRegression()
			
 
				+        lin_model.fit(df[x], df[y])
			
 
				+
			
 
				+        slopes, intercept, = lin_model.coef_, lin_model.intercept_
			
 
				+        df['predicted'] = lin_model.predict(df[x])
			
 
				+        r2 = lin_model.score(df[x], df[y])
			
 
				+        rmse = np.sqrt(mean_squared_error(
			
 
				+            y_true=df[y], y_pred=df['predicted']))
			
 
				+        equation = f'{y.replace("_", " ")} ='
			
 
				+
			
 
				+        names = ['r2', 'rmse', 'intercept']
			
 
				+        values = [r2, rmse, intercept]
			
 
				+        for i, (p, s) in enumerate(zip(x, slopes)):
			
 
				+            if (i + 1) % 3 == 0:
			
 
				+                equation += f'<br>{s:.2f} * {p.replace("_", " ")} +'
			
 
				+            else:
			
 
				+                equation += f' {s:.2f} * {p.replace("_", " ")} +'
			
 
				+            names.append(p)
			
 
				+            values.append(s)
			
 
				+
			
 
				+        equation += f' {intercept:.2f}'
			
 
				+        annotations = [dict(x=0.4 * df.index.max(), y=0.9 * df[y].max(), showarrow=False,
			
 
				+                            text=equation,
			
 
				+                            font=dict(size=10))]
			
 
				+
			
 
				+        df['index'] = list(df.index)
			
 
				+        figure = make_scatter_plot(df, x='index', y=y, fits=[
			
 
				+                                   'predicted'], annotations=annotations)
			
 
				+        summary = pd.DataFrame({'name': names, 'value': values})
			
 
				     else:
			
 
				-        lin_reg = stats.linregress(df[x], df[y])
			
 
				-        intercept, slope = lin_reg.intercept, lin_reg.slope
			
 
				-        params = ['pvalue', 'rvalue', 'slope', 'intercept']
			
 
				-        values = []
			
 
				-        for p in params:
			
 
				-            values.append(getattr(lin_reg, p))
			
 
				-        summary = pd.DataFrame({'param': params, 'value': values})
			
 
				-        df['fit_values'] = df[x] * slope + intercept
			
 
				-        equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
			
 
				-
			
 
				-    annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
			
 
				-                        text=equation,
			
 
				-                        font=dict(size=32))]
			
 
				-    figure = make_scatter_plot(
			
 
				-        df, x=x, y=y, fits=['fit_values'], annotations=annotations)
			
 
				+        if intercept_0:
			
 
				+            lin_reg = sm.OLS(df[y], df[x]).fit()
			
 
				+            df['fit_values'] = lin_reg.fittedvalues
			
 
				+            summary = lin_reg.summary()
			
 
				+            slope = float(lin_reg.params)
			
 
				+            equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')}$"
			
 
				+
			
 
				+        else:
			
 
				+            lin_reg = stats.linregress(df[x], df[y])
			
 
				+            intercept, slope = lin_reg.intercept, lin_reg.slope
			
 
				+            params = ['pvalue', 'rvalue', 'slope', 'intercept']
			
 
				+            values = []
			
 
				+            for p in params:
			
 
				+                values.append(getattr(lin_reg, p))
			
 
				+            summary = pd.DataFrame({'param': params, 'value': values})
			
 
				+            df['fit_values'] = df[x] * slope + intercept
			
 
				+            equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
			
 
				+
			
 
				+        annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
			
 
				+                            text=equation,
			
 
				+                            font=dict(size=32))]
			
 
				+        figure = make_scatter_plot(
			
 
				+            df, x=x, y=y, fits=['fit_values'], annotations=annotations)
			
 
				     return figure, summary
			
 
				 
			
 
				 
			
 
				-def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
			
 
				+def make_extrapolation(df, y, years, degree=4):
			
 
				     """
			
 
				-    Make an interactive plot. Adds a dropdown to separate articles from responses
			
 
				-    if there are responses in the data. If there is only articles (or only responses)
			
 
				-    adds a linear regression line.
			
 
				+    Extrapolate `y` into the future `years` with `degree`  polynomial fit
			
 
				 
			
 
				-    :param data: dataframe of entry data
			
 
				-    :param x: string for xaxis of plot
			
 
				-    :param y: sring for yaxis of plot
			
 
				-    :param base_title: string for title of plot
			
 
				-    :param time: boolean for whether the xaxis is a plot
			
 
				-    :param eq_pos: position of equation for linear regression
			
 
				-
			
 
				-    :return figure: an interactive plotly object for display
			
 
				+    :param df: dataframe of data
			
 
				+    :param y: string of column to extrapolate
			
 
				+    :param years: number of years to extrapolate into the future
			
 
				+    :param degree: integer degree of polynomial fit
			
 
				 
			
 
				+    :return figure: plotly figure for display using iplot or plot
			
 
				+    :return future_df: extrapolated numbers into the future
			
 
				     """
			
 
				 
			
 
				-    # Extract the relevant data
			
 
				-    responses = data[data["response"] == "response"].copy()
			
 
				-    articles = data[data["response"] == "article"].copy()
			
 
				-
			
 
				-    if not responses.empty:
			
 
				-        # Create scatterplot data, articles must be first for menu selection
			
 
				-        plot_data = [
			
 
				-            go.Scatter(
			
 
				-                x=articles[x],
			
 
				-                y=articles[y],
			
 
				-                mode="markers",
			
 
				-                name="articles",
			
 
				-                text=articles["title"],
			
 
				-                marker=dict(color="blue", size=12),
			
 
				-            ),
			
 
				-            go.Scatter(
			
 
				-                x=responses[x],
			
 
				-                y=responses[y],
			
 
				-                mode="markers",
			
 
				-                name="responses",
			
 
				-                marker=dict(color="green", size=12),
			
 
				-            ),
			
 
				-        ]
			
 
				+    df = df.copy()
			
 
				+    x = 'days_since_start'
			
 
				+    df['days_since_start'] = (
			
 
				+        (df['published_date'] - df['published_date'].min()).
			
 
				+        dt.total_seconds() / (3600 * 24)).astype(int)
			
 
				 
			
 
				-        if not time:
			
 
				-            annotations = {}
			
 
				-            for df, name in zip([articles, responses], ["articles", "responses"]):
			
 
				-
			
 
				-                regression = stats.linregress(x=df[x], y=df[y])
			
 
				-                slope = regression.slope
			
 
				-                intercept = regression.intercept
			
 
				-                rvalue = regression.rvalue
			
 
				-
			
 
				-                xi = np.array(range(int(df[x].min()), int(df[x].max())))
			
 
				-
			
 
				-                line = xi * slope + intercept
			
 
				-                trace = go.Scatter(
			
 
				-                    x=xi,
			
 
				-                    y=line,
			
 
				-                    mode="lines",
			
 
				-                    marker=dict(color="blue" if name ==
			
 
				-                                "articles" else "green"),
			
 
				-                    line=dict(width=4, dash="longdash"),
			
 
				-                    name=f"{name} linear fit",
			
 
				-                )
			
 
				+    cumy = f'cum_{y}'
			
 
				+    df[cumy] = df.sort_values(x)[y].cumsum()
			
 
				 
			
 
				-                annotations[name] = dict(
			
 
				-                    x=max(xi) * eq_pos[0],
			
 
				-                    y=df[y].max() * eq_pos[1],
			
 
				-                    showarrow=False,
			
 
				-                    text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
			
 
				-                    font=dict(size=16, color="blue" if name ==
			
 
				-                              "articles" else "green"),
			
 
				-                )
			
 
				+    figure, summary = make_poly_fits(df, x, cumy, degree=degree)
			
 
				 
			
 
				-                plot_data.append(trace)
			
 
				+    min_date = df['published_date'].min()
			
 
				+    max_date = df['published_date'].max()
			
 
				 
			
 
				-        # Make a layout with update menus
			
 
				-        layout = go.Layout(
			
 
				-            annotations=list(annotations.values()),
			
 
				-            height=600,
			
 
				-            width=900,
			
 
				-            title=base_title,
			
 
				-            xaxis=dict(
			
 
				-                title=x.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
			
 
				-            ),
			
 
				-            yaxis=dict(
			
 
				-                title=y.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
			
 
				-            ),
			
 
				-            updatemenus=make_update_menu(
			
 
				-                base_title, annotations["articles"], annotations["responses"]
			
 
				-            ),
			
 
				-        )
			
 
				+    date_range = pd.date_range(start=min_date,
			
 
				+                               end=max_date + pd.Timedelta(days=int(years * 365)))
			
 
				 
			
 
				-    # If there are only articles
			
 
				-    else:
			
 
				-        plot_data = [
			
 
				-            go.Scatter(
			
 
				-                x=data[x],
			
 
				-                y=data[y],
			
 
				-                mode="markers",
			
 
				-                name="observations",
			
 
				-                text=data["title"],
			
 
				-                marker=dict(color="blue", size=12),
			
 
				-            )
			
 
				-        ]
			
 
				+    future_df = pd.DataFrame({'date': date_range})
			
 
				 
			
 
				-        regression = stats.linregress(x=data[x], y=data[y])
			
 
				-        slope = regression.slope
			
 
				-        intercept = regression.intercept
			
 
				-        rvalue = regression.rvalue
			
 
				-
			
 
				-        xi = np.array(range(int(data[x].min()), int(data[x].max())))
			
 
				-        line = xi * slope + intercept
			
 
				-        trace = go.Scatter(
			
 
				-            x=xi,
			
 
				-            y=line,
			
 
				-            mode="lines",
			
 
				-            marker=dict(color="red"),
			
 
				-            line=dict(width=4, dash="longdash"),
			
 
				-            name="linear fit",
			
 
				-        )
			
 
				+    future_df[x] = (
			
 
				+        (future_df['date'] - future_df['date'].min()).
			
 
				+        dt.total_seconds() / (3600 * 24)).astype(int)
			
 
				 
			
 
				-        annotations = [
			
 
				-            dict(
			
 
				-                x=max(xi) * eq_pos[0],
			
 
				-                y=data[y].max() * eq_pos[1],
			
 
				-                showarrow=False,
			
 
				-                text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
			
 
				-                font=dict(size=16),
			
 
				-            )
			
 
				-        ]
			
 
				+    newcumy = f'cumulative_{y}'
			
 
				 
			
 
				-        plot_data.append(trace)
			
 
				+    future_df = future_df.merge(df[[x, cumy]], on=x, how='left').\
			
 
				+        rename(columns={cumy: newcumy})
			
 
				 
			
 
				-        layout = go.Layout(
			
 
				-            annotations=annotations,
			
 
				-            height=600,
			
 
				-            width=900,
			
 
				-            title=base_title,
			
 
				-            xaxis=dict(
			
 
				-                title=x.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
			
 
				-            ),
			
 
				-            yaxis=dict(
			
 
				-                title=y.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
			
 
				-            ),
			
 
				-        )
			
 
				+    z = np.poly1d(summary.iloc[-1]['params'])
			
 
				+    pred_name = f'predicted_{y}'
			
 
				+    future_df[pred_name] = z(future_df[x])
			
 
				+    future_df['title'] = ''
			
 
				 
			
 
				-    # Add a rangeselector and rangeslider for a data xaxis
			
 
				-    if time:
			
 
				-        rangeselector = dict(
			
 
				-            buttons=list(
			
 
				-                [
			
 
				-                    dict(count=1, label="1m", step="month", stepmode="backward"),
			
 
				-                    dict(count=6, label="6m", step="month", stepmode="backward"),
			
 
				-                    dict(count=1, label="YTD", step="year", stepmode="todate"),
			
 
				-                    dict(count=1, label="1y", step="year", stepmode="backward"),
			
 
				-                    dict(step="all"),
			
 
				-                ]
			
 
				-            )
			
 
				-        )
			
 
				-        rangeslider = dict(visible=True)
			
 
				-        layout["xaxis"]["rangeselector"] = rangeselector
			
 
				-        layout["xaxis"]["rangeslider"] = rangeslider
			
 
				-
			
 
				-        figure = go.Figure(data=plot_data, layout=layout)
			
 
				+    last_date = future_df.loc[future_df['date'].idxmax()]
			
 
				+    prediction_text = (
			
 
				+        f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}.")
			
 
				+    annotations = [dict(x=future_df['date'].quantile(0.4),
			
 
				+                        y=0.8 * future_df[pred_name].max(), text=prediction_text, showarrow=False,
			
 
				+                        font=dict(size=16))]
			
 
				 
			
 
				-        return figure
			
 
				+    title_override = f'{y.replace("_", " ").title()} with Extrapolation {years} Years into the Future'
			
 
				 
			
 
				-    # Return the figure
			
 
				-    figure = go.Figure(data=plot_data, layout=layout)
			
 
				-
			
 
				-    return figure
			
 
				+    figure = make_scatter_plot(future_df, 'date', newcumy, fits=[
			
 
				+                               pred_name], annotations=annotations, ranges=True, title_override=title_override)
			
 
				+    return figure, future_df