retrieval.py

import re
from collections import Counter
from datetime import datetime
from itertools import chain
from multiprocessing import Pool
from timeit import default_timer as timer

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_table_rows(fname='stats.html'):
    """
    Extract the table rows from the Medium statistics page.

    :param fname: string name of the file stored in the `data` directory

    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
    """
    with open(f'data/{fname}', 'r') as f:
        soup = BeautifulSoup(f, features='lxml')
    table_rows = soup.find_all(
        attrs={'class': 'sortableTable-row js-statsTableRow'})
    print(f'Found {len(table_rows)} entries in table.')
    return table_rows


def convert_timestamp(ts: int, tz: str):
    """Convert a Unix timestamp in milliseconds to a timezone-naive local datetime."""
    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize(
        'UTC').tz_convert(tz).tz_localize(None)
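
# Usage sketch (hypothetical value): Medium reports times in milliseconds since
# the Unix epoch, so a call like convert_timestamp(1541030400000, tz='America/Chicago')
# returns the corresponding naive local datetime.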


def process_entry(entry, parallel=True, tz='America/Chicago'):
    """
    Extract data from one entry in the table.

    :param entry: BeautifulSoup tag (or its string representation when run in parallel)
    :param parallel: Boolean for whether the function is being run in parallel
    :param tz: string representing the timezone for started and published times

    :return entry_dict: dictionary with data about the entry
    """
    # Rows are passed as strings for multiprocessing; convert back to soup
    if parallel:
        entry = BeautifulSoup(entry, features='lxml').body.tr

    entry_dict = {}
    # Extract the numeric statistics from the table row
    for value, key in zip(entry.find_all(attrs={'class': 'sortableTable-value'}),
                          ['published_date', 'views', 'reads', 'ratio', 'fans']):
        entry_dict[key] = float(value.text) if key == 'ratio' else int(value.text)

    entry_dict['read_time'] = int(
        entry.find_all(attrs={'class': 'readingTime'})[0].get('title').split(' ')[0])

    # Unlisted vs published
    entry_dict['type'] = 'unlisted' if len(
        entry.find_all(text=' Unlisted')) > 0 else 'published'

    # Publication
    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
    if 'In' in publication[0].text:
        entry_dict['publication'] = publication[0].text.split('In ')[1].split('View')[0]
    else:
        entry_dict['publication'] = 'None'

    # Convert datetimes
    entry_dict['published_date'] = convert_timestamp(
        entry_dict['published_date'], tz=tz)
    entry_dict['started_date'] = convert_timestamp(
        int(entry.get('data-timestamp')), tz=tz)

    # Get the link to the article
    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')
    entry_dict['link'] = link

    # Retrieve the article and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features='lxml')

    # Get the title (responses have no h1 heading)
    try:
        title = entry_soup.h1.text
    except AttributeError:
        title = 'response'

    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))

    # Main text entries
    entry_text = [p.text for p in entry_soup.find_all(
        ['h1', 'h2', 'h3', 'p', 'blockquote'])]

    # Make sure to catch list items as well
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--li'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--p'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--blockquote'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--pullquote'}))

    entry_text = ' '.join(entry_text)

    # Word count
    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))

    # Number of claps (e.g. '35 claps', '1.2K claps', '4K claps')
    clap_pattern = re.compile(
        r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
    claps = entry_soup.find_all(text=clap_pattern)

    if len(claps) > 0:
        if 'K' in claps[0]:
            clap_number = int(1e3 * float(claps[0].split('K')[0]))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0

    # Post tags
    tags = entry_soup.find_all(
        attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tags[0].find_all('li')]

    # Responses to the entry
    responses = entry_soup.find_all(
        attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
               'data-action': 'scroll-to-responses'})
    num_responses = int(responses[0].text) if len(responses) > 0 else 0

    # Store in dictionary
    entry_dict['title'] = title
    entry_dict['title_word_count'] = title_word_count
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags
    entry_dict['num_responses'] = num_responses

    # Time since publication
    entry_dict['days_since_publication'] = (
        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)

    return entry_dict
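
# Quick serial check (sketch): before launching the full pool, it can help to
# parse a single row end-to-end, for example:
#
#   rows = get_table_rows()
#   first = process_entry(str(rows[0]), parallel=True)
#   print(first['title'], first['views'], first['claps'])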


def process_in_parallel(table_rows, processes=20):
    """
    Process all the stats in a table in parallel.

    :note: make sure to set the correct timezone in `process_entry`
    :note: running on a Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing

    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of worker processes to use in parallel
    :return df: dataframe of information about each post
    """
    # Convert the rows to strings so they can be pickled for multiprocessing
    table_rows_str = [str(r) for r in table_rows]

    # Process each article in parallel
    pool = Pool(processes=processes)
    results = []
    start = timer()

    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
        # Report progress
        print(f'{100 * (i + 1) / len(table_rows_str):.2f}% complete.', end='\r')
        results.append(r)

    pool.close()
    pool.join()
    end = timer()
    print(f'Processed {len(table_rows_str)} articles in {end - start:.2f} seconds.')

    # Convert to dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)

    # Add extra columns with more data
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['editing_days'] = ((df['published_date'] - df['started_date']
                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)

    # Rounding
    df['published_date'] = df['published_date'].dt.round('min')
    df['started_date'] = df['started_date'].dt.round('min')
    df['read_ratio'] = df['read_ratio'].round(2)

    # 5 most common tags (might want to include more tags)
    n = 5
    all_tags = list(chain(*df['tags'].tolist()))
    tag_counts = Counter(all_tags)
    top_tags = tag_counts.most_common(n)

    # Add an indicator column for each of the most common tags
    for tag, count in top_tags:
        df.loc[:, f'<tag>{tag}'] = [1 if tag in post_tags else 0
                                    for post_tags in df['tags']]

    df.sort_values('published_date', inplace=True)
    return df


def get_data(fname='stats.html', processes=20):
    """
    Retrieve Medium article statistics.

    :note: running on a Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing

    :param fname: file name (should be 'stats.html')
    :param processes: integer number of processes
    :return df: dataframe of article data
    """
    t = get_table_rows(fname=fname)
    return process_in_parallel(t, processes=processes)
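

# Minimal usage sketch. Assumptions: the exported Medium stats page is saved as
# `data/stats.html`, and the output CSV path below is hypothetical.
if __name__ == '__main__':
    df = get_data(fname='stats.html', processes=20)
    print(df.head())
    # Persist the results for later analysis
    df.to_csv('data/medium_stats.csv', index=False)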