utils.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. # Data science imports
  2. import pandas as pd
  3. import numpy as np
  4. from scipy import stats
  5. # Options for pandas
  6. pd.options.display.max_columns = 20
  7. # Display all cell outputs
  8. from IPython.core.interactiveshell import InteractiveShell
  9. InteractiveShell.ast_node_interactivity = 'all'
  10. # Interactive plotting
  11. import plotly.plotly as py
  12. import plotly.graph_objs as go
  13. from plotly.offline import iplot
  14. import cufflinks
  15. cufflinks.go_offline()
  16. from timeit import default_timer as timer
  17. from collections import Counter, defaultdict
  18. from itertools import chain
  19. from bs4 import BeautifulSoup
  20. import re
  21. import requests
  22. from multiprocessing import Pool
  23. def get_links(soup):
  24. """
  25. Retrieve all links to entries on webpage
  26. :param soup: BeautifulSoup of HTML for page
  27. :return entry_links: list of links to entries
  28. """
  29. titles = soup.find_all(attrs = {'class': 'bq y br af bs ag db dc dd c de df dg'})
  30. pattern = re.compile('[0-9]{1,} min read')
  31. read_times = soup.find_all(text = pattern)
  32. read_times = [int(x.split(' ')[0]) for x in read_times]
  33. total_read_time = sum(read_times)
  34. print(f'Found {len(titles)} entries.')
  35. print(f'Total Read Time of Entries: {total_read_time} minutes.')
  36. entry_links = [title.a.get_attribute_list('href')[0] for title in titles]
  37. return entry_links
  38. def process_entry(link):
  39. """
  40. Retrieve data of single entry.
  41. :param link: string for link to entry
  42. :return entry_dict: dictionary of data about entry
  43. """
  44. entry_dict = {}
  45. # Retrieve the article and create a soup
  46. entry = requests.get(link).content
  47. entry_soup = BeautifulSoup(entry, features="lxml")
  48. # Publication time
  49. t = entry_soup.find_all('time')[0]
  50. t = pd.to_datetime(t.get('datetime'), utc=True).tz_convert('America/New_York')
  51. # Find the title header (determines if an article or a response)
  52. if entry_soup.h1 is not None:
  53. title = entry_soup.h1.text
  54. else:
  55. title = f'response-{t}'
  56. # Text as single long string
  57. entry_text = [p.text for p in entry_soup.find_all('p')]
  58. entry_text = ' '.join(entry_text)
  59. # Word count
  60. word_count = len(entry_text.split(' '))
  61. # Reading time in minutes
  62. read_time = entry_soup.find_all(attrs={'class': 'readingTime'})
  63. read_mins = int(read_time[0].get('title').split(' ')[0])
  64. # Number of claps
  65. clap_pattern = re.compile('^[0-9]{1,} claps|^[0-9]{1,}.[0-9]{1,}K claps|^[0-9]{1,}K claps')
  66. claps = entry_soup.find_all(text = clap_pattern)
  67. if len(claps) > 0:
  68. if 'K' in claps[0]:
  69. clap_number = int(1e3 * float(claps[0].split('K')[0]))
  70. else:
  71. clap_number = int(claps[0].split(' ')[0])
  72. else:
  73. clap_number = 0
  74. # Post tags
  75. tags = entry_soup.find_all(attrs={'class': 'tags tags--postTags tags--borderless'})
  76. tags = [li.text for li in tags[0].find_all('li')]
  77. # Store in dictionary with title as key
  78. entry_dict['title'] = title
  79. entry_dict['text'] = entry_text
  80. entry_dict['word_count'] = word_count
  81. entry_dict['read_time'] = read_mins
  82. entry_dict['claps'] = clap_number
  83. entry_dict['time_published'] = t
  84. entry_dict['tags'] = tags
  85. return entry_dict
  86. def process_in_parallel(links, processes=20):
  87. """
  88. Process entries in parallel
  89. :param links: list of entry links
  90. :param processes: integer number of processes (threads) to use in parallel
  91. :return results: list of dictionaries of entry data
  92. """
  93. pool = Pool(processes=processes)
  94. results = []
  95. start = timer()
  96. for i, result in enumerate(pool.imap_unordered(process_entry, links)):
  97. if (i + 1) % 5 == 0:
  98. print(f'{100 * i / len(links):.2f}% complete.', end='\r')
  99. results.append(result)
  100. pool.close()
  101. pool.join()
  102. end = timer()
  103. print(f'Processed {len(results)} entries in {end-start:.0f} seconds.')
  104. # Add extra columns with more data
  105. df = pd.DataFrame.from_dict(results)
  106. df['response'] = ['response' if x == True else 'article' for x in df['title'].str.contains('response')]
  107. df['claps_per_word'] = df['claps'] / df['word_count']
  108. df['words_per_minute'] = df['word_count'] / df['read_time']
  109. # Add 10 most common tags with flag if data has it
  110. n = 10
  111. all_tags = list(chain(*df['tags'].tolist()))
  112. tag_counts = Counter(all_tags)
  113. tags = tag_counts.most_common(n)
  114. for tag, count in tags:
  115. flag = [1 if tag in tags else 0 for tags in df['tags']]
  116. df.loc[:, f'<tag>{tag}'] = flag
  117. return df
  118. def make_update_menu(base_title, article_annotations=None, response_annotations=None):
  119. """
  120. Make an updatemenu for interative plot
  121. :param base_title: string for title of plot
  122. :return updatemenus: a updatemenus object for adding to a layout
  123. """
  124. updatemenus = list([
  125. dict(
  126. buttons=list([
  127. dict(
  128. label='both', method='update',
  129. args=[dict(visible=[True, True]), dict(title = base_title,
  130. annotations=[article_annotations,response_annotations])]),
  131. dict(
  132. label='articles',
  133. method='update',
  134. args=[dict(visible=[True, False]), dict(title = 'Article ' + base_title,
  135. annotations = [article_annotations])]),
  136. dict(
  137. label='responses',
  138. method='update',
  139. args=[dict(visible=[False, True]), dict(title='Response ' + base_title,
  140. annotations = [response_annotations])]),
  141. ]))
  142. ])
  143. return updatemenus
  144. def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
  145. """
  146. Make an interactive plot. Adds a dropdown to separate articles from responses
  147. if there are responses in the data. If there is only articles (or only responses)
  148. adds a linear regression line.
  149. :param data: dataframe of entry data
  150. :param x: string for xaxis of plot
  151. :param y: sring for yaxis of plot
  152. :param base_title: string for title of plot
  153. :param time: boolean for whether the xaxis is a plot
  154. :param eq_pos: position of equation for linear regression
  155. :return figure: an interactive plotly object for display
  156. """
  157. # Extract the relevant data
  158. responses = data[data['response'] == 'response'].copy()
  159. articles = data[data['response'] == 'article'].copy()
  160. if not responses.empty:
  161. # Create scatterplot data, articles must be first for menu selection
  162. plot_data = [
  163. go.Scatter(
  164. x=articles[x],
  165. y=articles[y],
  166. mode='markers',
  167. name='articles',
  168. text=articles['title'],
  169. marker=dict(color='blue', size=12)),
  170. go.Scatter(
  171. x=responses[x],
  172. y=responses[y],
  173. mode='markers',
  174. name='responses',
  175. marker=dict(color='green', size=12))
  176. ]
  177. if not time:
  178. annotations = {}
  179. for df, name in zip([articles, responses],
  180. ['articles', 'responses']):
  181. regression = stats.linregress(x=df[x], y=df[y])
  182. slope = regression.slope
  183. intercept = regression.intercept
  184. rvalue = regression.rvalue
  185. xi = np.array(range(int(df[x].min()), int(df[x].max())))
  186. line = xi*slope + intercept
  187. trace = go.Scatter(
  188. x=xi,
  189. y=line,
  190. mode='lines',
  191. marker=dict(color='blue' if name == 'articles' else 'green'),
  192. line=dict(width=4, dash='longdash'),
  193. name=f'{name} linear fit'
  194. )
  195. annotations[name] = dict(
  196. x=max(xi) * eq_pos[0],
  197. y=df[y].max() * eq_pos[1],
  198. showarrow=False,
  199. text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
  200. font=dict(size=16, color='blue' if name == 'articles' else 'green')
  201. )
  202. plot_data.append(trace)
  203. # Make a layout with update menus
  204. layout = go.Layout(annotations=list(annotations.values()),
  205. height=600,
  206. width=900,
  207. title=base_title,
  208. xaxis=dict(
  209. title=x.title(),
  210. tickfont=dict(size=14),
  211. titlefont=dict(size=16)),
  212. yaxis=dict(
  213. title=y.title(),
  214. tickfont=dict(size=14),
  215. titlefont=dict(size=16)),
  216. updatemenus=make_update_menu(base_title, annotations['articles'], annotations['responses']))
  217. # If there are only articles
  218. else:
  219. plot_data = [
  220. go.Scatter(
  221. x=data[x],
  222. y=data[y],
  223. mode='markers',
  224. name = 'observations',
  225. text=data['title'],
  226. marker=dict(color='blue', size=12))
  227. ]
  228. regression = stats.linregress(x=data[x], y=data[y])
  229. slope = regression.slope
  230. intercept = regression.intercept
  231. rvalue = regression.rvalue
  232. xi = np.array(range(int(data[x].min()), int(data[x].max())))
  233. line = xi*slope + intercept
  234. trace = go.Scatter(
  235. x=xi,
  236. y=line,
  237. mode='lines',
  238. marker=dict(color='red'),
  239. line=dict(width=4, dash='longdash'),
  240. name='linear fit'
  241. )
  242. annotations = [dict(
  243. x=max(xi) * eq_pos[0],
  244. y=data[y].max() * eq_pos[1],
  245. showarrow=False,
  246. text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
  247. font=dict(size=16)
  248. )]
  249. plot_data.append(trace)
  250. layout = go.Layout(annotations=annotations,
  251. height=600,
  252. width=900,
  253. title=base_title,
  254. xaxis=dict(
  255. title=x.title(),
  256. tickfont=dict(size=14),
  257. titlefont=dict(size=16)),
  258. yaxis=dict(
  259. title=y.title(),
  260. tickfont=dict(size=14),
  261. titlefont=dict(size=16)))
  262. # Add a rangeselector and rangeslider for a data xaxis
  263. if time:
  264. rangeselector = dict(
  265. buttons=list([
  266. dict(count=1, label='1m', step='month', stepmode='backward'),
  267. dict(count=6, label='6m', step='month', stepmode='backward'),
  268. dict(count=1, label='YTD', step='year', stepmode='todate'),
  269. dict(count=1, label='1y', step='year', stepmode='backward'),
  270. dict(step='all')
  271. ]))
  272. rangeslider = dict(visible=True)
  273. layout['xaxis']['rangeselector'] = rangeselector
  274. layout['xaxis']['rangeslider'] = rangeslider
  275. figure = go.Figure(data=plot_data, layout=layout)
  276. return figure
  277. # Return the figure
  278. figure = go.Figure(data=plot_data, layout=layout)
  279. return figure