Przeglądaj źródła

Done with notebook for article publication

WillKoehrsen 6 lat temu
rodzic
commit
5740637a7a

+ 2 - 0
.gitignore

@@ -1,2 +1,4 @@
+*.pyc
+*__pycache__
 *DS_Store
 medium/data/*_files

Plik diff jest za duży
+ 20883 - 37412
medium/Medium Stats Analysis.ipynb


Plik diff jest za duży
+ 6566 - 0
medium/Work In Progress.ipynb


Plik diff jest za duży
+ 3 - 3
medium/data/stats.html


BIN
medium/images/stats-saving-medium.gif


+ 18 - 5
medium/retrieval.py

@@ -29,7 +29,7 @@ def get_table_rows(fname='stats.html'):
 
 def convert_timestamp(ts: int, tz: str):
     """Convert a unix timestamp to a date timestamp"""
-    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz)
+    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz).tz_localize(None)
 
 
 def process_entry(entry, parallel=True, tz='America/Chicago'):
@@ -143,8 +143,8 @@ def process_entry(entry, parallel=True, tz='America/Chicago'):
     entry_dict['num_responses'] = num_responses
 
     # Time since publication
-    entry_dict['days_since_publication'] = (datetime.now(tz=pytz.timezone(
-        tz)) - entry_dict['published_date']).total_seconds() / (3600 * 24)
+    entry_dict['days_since_publication'] = (
+        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)
 
     return entry_dict
 
@@ -181,10 +181,18 @@ def process_in_parallel(table_rows, processes=20):
 
     # Convert to dataframe
     df = pd.DataFrame(results)
+    # Rename ratio
+    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)
     # Add extra columns with more data
     df['claps_per_word'] = df['claps'] / df['word_count']
-    df['edit_days'] = (df['published_date'] - df['started_date']
-                       ).dt.total_seconds() / (60 * 60 * 24)
+    df['editing_days'] = ((df['published_date'] - df['started_date']
+                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)
+
+    # Rounding
+    df['published_date'] = df['published_date'].dt.round('min')
+    df['started_date'] = df['started_date'].dt.round('min')
+    df['read_ratio'] = df['read_ratio'].round(2)
+
     # 5 most common tags (might want to include more tags)
     n = 5
     all_tags = list(chain(*df['tags'].tolist()))
@@ -198,3 +206,8 @@ def process_in_parallel(table_rows, processes=20):
 
     df.sort_values('published_date', inplace=True)
     return df
+
+
+def get_data(fname='stats.html', processes=20):
+    t = get_table_rows(fname=fname)
+    return process_in_parallel(t, processes=processes)

+ 4 - 61
medium/visuals.py

@@ -14,63 +14,6 @@ import cufflinks
 cufflinks.go_offline()
 
 
-def make_update_menu(base_title, article_annotations=None, response_annotations=None):
-    """
-    Make an updatemenu for interative plot
-
-    :param base_title: string for title of plot
-
-    :return updatemenus: a updatemenus object for adding to a layout
-    """
-    updatemenus = list(
-        [
-            dict(
-                buttons=list(
-                    [
-                        dict(
-                            label="both",
-                            method="update",
-                            args=[
-                                dict(visible=[True, True]),
-                                dict(
-                                    title=base_title,
-                                    annotations=[
-                                        article_annotations,
-                                        response_annotations,
-                                    ],
-                                ),
-                            ],
-                        ),
-                        dict(
-                            label="articles",
-                            method="update",
-                            args=[
-                                dict(visible=[True, False]),
-                                dict(
-                                    title="Article " + base_title,
-                                    annotations=[article_annotations],
-                                ),
-                            ],
-                        ),
-                        dict(
-                            label="responses",
-                            method="update",
-                            args=[
-                                dict(visible=[False, True]),
-                                dict(
-                                    title="Response " + base_title,
-                                    annotations=[response_annotations],
-                                ),
-                            ],
-                        ),
-                    ]
-                )
-            )
-        ]
-    )
-    return updatemenus
-
-
 def make_hist(df, x, category=None):
     """
     Make an interactive histogram, optionally segmented by `category`
@@ -162,8 +105,8 @@ def make_cum_plot(df, y, category=None, ranges=False):
     if len(y) == 2:
         layout = go.Layout(
             xaxis=dict(title="Published Date", type="date"),
-            yaxis=dict(title=y[0].title(), color='blue'),
-            yaxis2=dict(title=y[1].title(), color='red',
+            yaxis=dict(title=y[0].replace('_', ' ').title(), color='blue'),
+            yaxis2=dict(title=y[1].replace('_', ' ').title(), color='red',
                         overlaying='y', side='right'),
             font=dict(size=14),
             title=f"Cumulative {y[0].title()} and {y[1].title()}",
@@ -368,7 +311,7 @@ def make_linear_regression(df, x, y, intercept_0):
             df['fit_values'] = lin_reg.fittedvalues
             summary = lin_reg.summary()
             slope = float(lin_reg.params)
-            equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')}$"
+            equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')}$"
 
         else:
             lin_reg = stats.linregress(df[x], df[y])
@@ -379,7 +322,7 @@ def make_linear_regression(df, x, y, intercept_0):
                 values.append(getattr(lin_reg, p))
             summary = pd.DataFrame({'param': params, 'value': values})
             df['fit_values'] = df[x] * slope + intercept
-            equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
+            equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
 
         annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
                             text=equation,