|
@@ -1,348 +0,0 @@
|
|
|
-# Data science imports
|
|
|
-import pandas as pd
|
|
|
-import numpy as np
|
|
|
-
|
|
|
-from scipy import stats
|
|
|
-
|
|
|
-# Options for pandas
|
|
|
-pd.options.display.max_columns = 20
|
|
|
-
|
|
|
-# Display all cell outputs
|
|
|
-from IPython.core.interactiveshell import InteractiveShell
|
|
|
-InteractiveShell.ast_node_interactivity = 'all'
|
|
|
-
|
|
|
-# Interactive plotting
|
|
|
-import plotly.plotly as py
|
|
|
-import plotly.graph_objs as go
|
|
|
-from plotly.offline import iplot
|
|
|
-import cufflinks
|
|
|
-cufflinks.go_offline()
|
|
|
-
|
|
|
-from timeit import default_timer as timer
|
|
|
-
|
|
|
-from collections import Counter, defaultdict
|
|
|
-from itertools import chain
|
|
|
-
|
|
|
-from bs4 import BeautifulSoup
|
|
|
-import re
|
|
|
-
|
|
|
-import requests
|
|
|
-from multiprocessing import Pool
|
|
|
-
|
|
|
-def get_links(soup):
|
|
|
- """
|
|
|
- Retrieve all links to entries on webpage
|
|
|
-
|
|
|
- :param soup: BeautifulSoup of HTML for page
|
|
|
- :return entry_links: list of links to entries
|
|
|
-
|
|
|
- """
|
|
|
- titles = soup.find_all(attrs = {'class': 'bq y br af bs ag db dc dd c de df dg'})
|
|
|
- pattern = re.compile('[0-9]{1,} min read')
|
|
|
- read_times = soup.find_all(text = pattern)
|
|
|
- read_times = [int(x.split(' ')[0]) for x in read_times]
|
|
|
- total_read_time = sum(read_times)
|
|
|
-
|
|
|
- print(f'Found {len(titles)} entries.')
|
|
|
- print(f'Total Read Time of Entries: {total_read_time} minutes.')
|
|
|
- entry_links = [title.a.get_attribute_list('href')[0] for title in titles]
|
|
|
-
|
|
|
- return entry_links
|
|
|
-
|
|
|
-def process_entry(link):
|
|
|
- """
|
|
|
- Retrieve data of single entry.
|
|
|
-
|
|
|
- :param link: string for link to entry
|
|
|
-
|
|
|
- :return entry_dict: dictionary of data about entry
|
|
|
- """
|
|
|
-
|
|
|
- entry_dict = {}
|
|
|
-
|
|
|
- # Retrieve the article and create a soup
|
|
|
- entry = requests.get(link).content
|
|
|
- entry_soup = BeautifulSoup(entry, features="lxml")
|
|
|
-
|
|
|
- # Publication time
|
|
|
- t = entry_soup.find_all('time')[0]
|
|
|
- t = pd.to_datetime(t.get('datetime'), utc=True).tz_convert('America/New_York')
|
|
|
-
|
|
|
- # Find the title header (determines if an article or a response)
|
|
|
- if entry_soup.h1 is not None:
|
|
|
- title = entry_soup.h1.text
|
|
|
- else:
|
|
|
- title = f'response-{t}'
|
|
|
-
|
|
|
- # Text as single long string
|
|
|
- entry_text = [p.text for p in entry_soup.find_all('p')]
|
|
|
- entry_text = ' '.join(entry_text)
|
|
|
-
|
|
|
- # Word count
|
|
|
- word_count = len(entry_text.split(' '))
|
|
|
-
|
|
|
- # Reading time in minutes
|
|
|
- read_time = entry_soup.find_all(attrs={'class': 'readingTime'})
|
|
|
- read_mins = int(read_time[0].get('title').split(' ')[0])
|
|
|
-
|
|
|
- # Number of claps
|
|
|
- clap_pattern = re.compile('^[0-9]{1,} claps|^[0-9]{1,}.[0-9]{1,}K claps|^[0-9]{1,}K claps')
|
|
|
- claps = entry_soup.find_all(text = clap_pattern)
|
|
|
-
|
|
|
- if len(claps) > 0:
|
|
|
- if 'K' in claps[0]:
|
|
|
- clap_number = int(1e3 * float(claps[0].split('K')[0]))
|
|
|
- else:
|
|
|
- clap_number = int(claps[0].split(' ')[0])
|
|
|
- else:
|
|
|
- clap_number = 0
|
|
|
-
|
|
|
- # Post tags
|
|
|
- tags = entry_soup.find_all(attrs={'class': 'tags tags--postTags tags--borderless'})
|
|
|
- tags = [li.text for li in tags[0].find_all('li')]
|
|
|
-
|
|
|
- # Store in dictionary with title as key
|
|
|
- entry_dict['title'] = title
|
|
|
- entry_dict['text'] = entry_text
|
|
|
- entry_dict['word_count'] = word_count
|
|
|
- entry_dict['read_time'] = read_mins
|
|
|
- entry_dict['claps'] = clap_number
|
|
|
- entry_dict['time_published'] = t
|
|
|
- entry_dict['tags'] = tags
|
|
|
-
|
|
|
-
|
|
|
- return entry_dict
|
|
|
-
|
|
|
-def process_in_parallel(links, processes=20):
|
|
|
- """
|
|
|
- Process entries in parallel
|
|
|
-
|
|
|
- :param links: list of entry links
|
|
|
- :param processes: integer number of processes (threads) to use in parallel
|
|
|
-
|
|
|
- :return results: list of dictionaries of entry data
|
|
|
- """
|
|
|
- pool = Pool(processes=processes)
|
|
|
- results = []
|
|
|
-
|
|
|
- start = timer()
|
|
|
- for i, result in enumerate(pool.imap_unordered(process_entry, links)):
|
|
|
- if (i + 1) % 5 == 0:
|
|
|
- print(f'{100 * i / len(links):.2f}% complete.', end='\r')
|
|
|
- results.append(result)
|
|
|
-
|
|
|
- pool.close()
|
|
|
- pool.join()
|
|
|
- end = timer()
|
|
|
-
|
|
|
- print(f'Processed {len(results)} entries in {end-start:.0f} seconds.')
|
|
|
-
|
|
|
- # Add extra columns with more data
|
|
|
- df = pd.DataFrame.from_dict(results)
|
|
|
- df['response'] = ['response' if x == True else 'article' for x in df['title'].str.contains('response')]
|
|
|
- df['claps_per_word'] = df['claps'] / df['word_count']
|
|
|
- df['words_per_minute'] = df['word_count'] / df['read_time']
|
|
|
-
|
|
|
- # Add 10 most common tags with flag if data has it
|
|
|
- n = 10
|
|
|
- all_tags = list(chain(*df['tags'].tolist()))
|
|
|
- tag_counts = Counter(all_tags)
|
|
|
- tags = tag_counts.most_common(n)
|
|
|
-
|
|
|
- for tag, count in tags:
|
|
|
- flag = [1 if tag in tags else 0 for tags in df['tags']]
|
|
|
- df.loc[:, f'<tag>{tag}'] = flag
|
|
|
-
|
|
|
- return df
|
|
|
-
|
|
|
-def make_update_menu(base_title, article_annotations=None, response_annotations=None):
|
|
|
- """
|
|
|
- Make an updatemenu for interative plot
|
|
|
-
|
|
|
- :param base_title: string for title of plot
|
|
|
-
|
|
|
- :return updatemenus: a updatemenus object for adding to a layout
|
|
|
- """
|
|
|
- updatemenus = list([
|
|
|
- dict(
|
|
|
- buttons=list([
|
|
|
- dict(
|
|
|
- label='both', method='update',
|
|
|
- args=[dict(visible=[True, True]), dict(title = base_title,
|
|
|
- annotations=[article_annotations,response_annotations])]),
|
|
|
- dict(
|
|
|
- label='articles',
|
|
|
- method='update',
|
|
|
- args=[dict(visible=[True, False]), dict(title = 'Article ' + base_title,
|
|
|
- annotations = [article_annotations])]),
|
|
|
- dict(
|
|
|
- label='responses',
|
|
|
- method='update',
|
|
|
- args=[dict(visible=[False, True]), dict(title='Response ' + base_title,
|
|
|
- annotations = [response_annotations])]),
|
|
|
- ]))
|
|
|
- ])
|
|
|
- return updatemenus
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
|
|
|
- """
|
|
|
- Make an interactive plot. Adds a dropdown to separate articles from responses
|
|
|
- if there are responses in the data. If there is only articles (or only responses)
|
|
|
- adds a linear regression line.
|
|
|
-
|
|
|
- :param data: dataframe of entry data
|
|
|
- :param x: string for xaxis of plot
|
|
|
- :param y: sring for yaxis of plot
|
|
|
- :param base_title: string for title of plot
|
|
|
- :param time: boolean for whether the xaxis is a plot
|
|
|
- :param eq_pos: position of equation for linear regression
|
|
|
-
|
|
|
- :return figure: an interactive plotly object for display
|
|
|
-
|
|
|
- """
|
|
|
-
|
|
|
- # Extract the relevant data
|
|
|
- responses = data[data['response'] == 'response'].copy()
|
|
|
- articles = data[data['response'] == 'article'].copy()
|
|
|
-
|
|
|
- if not responses.empty:
|
|
|
- # Create scatterplot data, articles must be first for menu selection
|
|
|
- plot_data = [
|
|
|
- go.Scatter(
|
|
|
- x=articles[x],
|
|
|
- y=articles[y],
|
|
|
- mode='markers',
|
|
|
- name='articles',
|
|
|
- text=articles['title'],
|
|
|
- marker=dict(color='blue', size=12)),
|
|
|
- go.Scatter(
|
|
|
- x=responses[x],
|
|
|
- y=responses[y],
|
|
|
- mode='markers',
|
|
|
- name='responses',
|
|
|
- marker=dict(color='green', size=12))
|
|
|
- ]
|
|
|
-
|
|
|
- if not time:
|
|
|
- annotations = {}
|
|
|
- for df, name in zip([articles, responses],
|
|
|
- ['articles', 'responses']):
|
|
|
-
|
|
|
- regression = stats.linregress(x=df[x], y=df[y])
|
|
|
- slope = regression.slope
|
|
|
- intercept = regression.intercept
|
|
|
- rvalue = regression.rvalue
|
|
|
-
|
|
|
- xi = np.array(range(int(df[x].min()), int(df[x].max())))
|
|
|
-
|
|
|
- line = xi*slope + intercept
|
|
|
- trace = go.Scatter(
|
|
|
- x=xi,
|
|
|
- y=line,
|
|
|
- mode='lines',
|
|
|
- marker=dict(color='blue' if name == 'articles' else 'green'),
|
|
|
- line=dict(width=4, dash='longdash'),
|
|
|
- name=f'{name} linear fit'
|
|
|
- )
|
|
|
-
|
|
|
- annotations[name] = dict(
|
|
|
- x=max(xi) * eq_pos[0],
|
|
|
- y=df[y].max() * eq_pos[1],
|
|
|
- showarrow=False,
|
|
|
- text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
|
|
|
- font=dict(size=16, color='blue' if name == 'articles' else 'green')
|
|
|
- )
|
|
|
-
|
|
|
- plot_data.append(trace)
|
|
|
-
|
|
|
- # Make a layout with update menus
|
|
|
- layout = go.Layout(annotations=list(annotations.values()),
|
|
|
- height=600,
|
|
|
- width=900,
|
|
|
- title=base_title,
|
|
|
- xaxis=dict(
|
|
|
- title=x.title(),
|
|
|
- tickfont=dict(size=14),
|
|
|
- titlefont=dict(size=16)),
|
|
|
- yaxis=dict(
|
|
|
- title=y.title(),
|
|
|
- tickfont=dict(size=14),
|
|
|
- titlefont=dict(size=16)),
|
|
|
- updatemenus=make_update_menu(base_title, annotations['articles'], annotations['responses']))
|
|
|
-
|
|
|
- # If there are only articles
|
|
|
- else:
|
|
|
- plot_data = [
|
|
|
- go.Scatter(
|
|
|
- x=data[x],
|
|
|
- y=data[y],
|
|
|
- mode='markers',
|
|
|
- name = 'observations',
|
|
|
- text=data['title'],
|
|
|
- marker=dict(color='blue', size=12))
|
|
|
- ]
|
|
|
-
|
|
|
- regression = stats.linregress(x=data[x], y=data[y])
|
|
|
- slope = regression.slope
|
|
|
- intercept = regression.intercept
|
|
|
- rvalue = regression.rvalue
|
|
|
-
|
|
|
- xi = np.array(range(int(data[x].min()), int(data[x].max())))
|
|
|
- line = xi*slope + intercept
|
|
|
- trace = go.Scatter(
|
|
|
- x=xi,
|
|
|
- y=line,
|
|
|
- mode='lines',
|
|
|
- marker=dict(color='red'),
|
|
|
- line=dict(width=4, dash='longdash'),
|
|
|
- name='linear fit'
|
|
|
- )
|
|
|
-
|
|
|
- annotations = [dict(
|
|
|
- x=max(xi) * eq_pos[0],
|
|
|
- y=data[y].max() * eq_pos[1],
|
|
|
- showarrow=False,
|
|
|
- text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
|
|
|
- font=dict(size=16)
|
|
|
- )]
|
|
|
-
|
|
|
- plot_data.append(trace)
|
|
|
-
|
|
|
- layout = go.Layout(annotations=annotations,
|
|
|
- height=600,
|
|
|
- width=900,
|
|
|
- title=base_title,
|
|
|
- xaxis=dict(
|
|
|
- title=x.title(),
|
|
|
- tickfont=dict(size=14),
|
|
|
- titlefont=dict(size=16)),
|
|
|
- yaxis=dict(
|
|
|
- title=y.title(),
|
|
|
- tickfont=dict(size=14),
|
|
|
- titlefont=dict(size=16)))
|
|
|
-
|
|
|
- # Add a rangeselector and rangeslider for a data xaxis
|
|
|
- if time:
|
|
|
- rangeselector = dict(
|
|
|
- buttons=list([
|
|
|
- dict(count=1, label='1m', step='month', stepmode='backward'),
|
|
|
- dict(count=6, label='6m', step='month', stepmode='backward'),
|
|
|
- dict(count=1, label='YTD', step='year', stepmode='todate'),
|
|
|
- dict(count=1, label='1y', step='year', stepmode='backward'),
|
|
|
- dict(step='all')
|
|
|
- ]))
|
|
|
- rangeslider = dict(visible=True)
|
|
|
- layout['xaxis']['rangeselector'] = rangeselector
|
|
|
- layout['xaxis']['rangeslider'] = rangeslider
|
|
|
-
|
|
|
- figure = go.Figure(data=plot_data, layout=layout)
|
|
|
-
|
|
|
- return figure
|
|
|
-
|
|
|
-
|
|
|
- # Return the figure
|
|
|
- figure = go.Figure(data=plot_data, layout=layout)
|
|
|
-
|
|
|
- return figure
|