retrieval.py

from multiprocessing import Pool
import requests
import re
from bs4 import BeautifulSoup
from itertools import chain
from collections import Counter
from timeit import default_timer as timer
import pandas as pd
from datetime import datetime
def get_table_rows(fname='stats.html'):
    """
    Extract the table rows from the statistics
    :param fname: string name of the file stored in the `data` directory
    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
    """
    soup = BeautifulSoup(
        open(f'data/{fname}', 'r', encoding='utf8'), features='lxml')
    table_rows = soup.find_all(
        attrs={'class': "sortableTable-row js-statsTableRow"})
    print(f'Found {len(table_rows)} entries in table.')
    return table_rows
def convert_timestamp(ts: int, tz: str):
    """Convert a unix timestamp in milliseconds to a naive datetime in the given timezone"""
    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz).tz_localize(None)
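# Usage sketch for convert_timestamp (the epoch value below is illustrative):
#   convert_timestamp(1514764800000, tz='America/Chicago')
#   -> Timestamp('2017-12-31 18:00:00')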
def process_entry(entry, parallel=True, tz='America/Chicago'):
    """
    Extract data from one entry in the table
    :param entry: BeautifulSoup tag (or its string representation when run in parallel)
    :param parallel: Boolean for whether function is being run in parallel
    :param tz: string representing timezone for started and published time
    :return entry_dict: dictionary with data about entry
    """
    # Rows are passed as strings when running in parallel, so convert back to soup
    if parallel:
        entry = BeautifulSoup(entry, features='lxml').body.tr
    entry_dict = {}
    # Extract the numeric statistics from the table row
    for value, key in zip(entry.find_all(attrs={'class': 'sortableTable-value'}),
                          ['published_date', 'views', 'reads', 'ratio', 'fans']):
        entry_dict[key] = float(
            value.text) if key == 'ratio' else int(value.text)
    entry_dict['read_time'] = int(entry.find_all(attrs={'class': 'readingTime'})[
        0].get('title').split(' ')[0])
    # Unlisted vs published
    entry_dict['type'] = 'unlisted' if len(
        entry.find_all(text=' Unlisted')) > 0 else 'published'
    # Publication
    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
    if 'In' in publication[0].text:
        entry_dict['publication'] = publication[0].text.split('In ')[
            1].split('View')[0]
    else:
        entry_dict['publication'] = 'None'
    # Convert datetimes
    entry_dict['published_date'] = convert_timestamp(
        entry_dict['published_date'], tz=tz)
    entry_dict['started_date'] = convert_timestamp(
        int(entry.get('data-timestamp')), tz=tz)
    # Get the link
    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')
    entry_dict['link'] = link
    # Retrieve the article and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features='lxml')
    # Get the title (responses have no h1 heading)
    try:
        title = entry_soup.h1.text
    except AttributeError:
        title = 'response'
    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
    # Main text entries
    entry_text = [p.text for p in entry_soup.find_all(
        ['h1', 'h2', 'h3', 'p', 'blockquote'])]
    # Make sure to catch list items in every context
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--li'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--p'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--blockquote'}))
    entry_text.extend(s.text for s in entry_soup.find_all(
        attrs={'class': 'graf graf--li graf-after--pullquote'}))
    entry_text = ' '.join(entry_text)
    # Word count
    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))
    # Number of claps, e.g. "105 claps" or "2.3K claps"
    clap_pattern = re.compile(
        r'^[0-9]{1,} claps|^[0-9]{1,}\.[0-9]{1,}K claps|^[0-9]{1,}K claps')
    claps = entry_soup.find_all(text=clap_pattern)
    if len(claps) > 0:
        if 'K' in claps[0]:
            clap_number = int(1e3 * float(claps[0].split('K')[0]))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0
    # Post tags
    tags = entry_soup.find_all(
        attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tags[0].find_all('li')]
    # Responses to entry
    responses = entry_soup.find_all(attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
                                           'data-action': 'scroll-to-responses'})
    num_responses = int(responses[0].text) if len(responses) > 0 else 0
    # Store in dictionary
    entry_dict['title'] = title
    entry_dict['title_word_count'] = title_word_count
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags
    entry_dict['num_responses'] = num_responses
    # Time since publication
    entry_dict['days_since_publication'] = (
        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)
    return entry_dict
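# Debugging sketch: a single row can be processed serially (no multiprocessing)
# by passing the BeautifulSoup tag directly with parallel=False, e.g.
#   rows = get_table_rows()
#   process_entry(rows[0], parallel=False)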
def process_in_parallel(table_rows, processes=20):
    """
    Process all the stats in a table in parallel
    :note: make sure to set the correct time zone in `process_entry`
    :note: running on Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing
    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of processes (threads) to use in parallel
    :return df: dataframe of information about each post
    """
    # Convert to strings for multiprocessing
    table_rows_str = [str(r) for r in table_rows]
    # Process each article in parallel
    pool = Pool(processes=processes)
    results = []
    start = timer()
    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
        # Report progress
        print(f'{100 * i / len(table_rows_str):.2f}% complete.', end='\r')
        results.append(r)
    pool.close()
    pool.join()
    end = timer()
    print(
        f'Processed {len(table_rows_str)} articles in {end - start:.2f} seconds.')
    # Convert to dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)
    # Add extra columns with more data
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['editing_days'] = ((df['published_date'] - df['started_date']
                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)
    # Rounding
    df['published_date'] = df['published_date'].dt.round('min')
    df['started_date'] = df['started_date'].dt.round('min')
    df['read_ratio'] = df['read_ratio'].round(2)
    # 5 most common tags (might want to include more tags)
    n = 5
    all_tags = list(chain(*df['tags'].tolist()))
    tag_counts = Counter(all_tags)
    tags = tag_counts.most_common(n)
    # Add one indicator column per common tag
    for tag, count in tags:
        flag = [1 if tag in post_tags else 0 for post_tags in df['tags']]
        df.loc[:, f'<tag>{tag}'] = flag
    df.sort_values('published_date', inplace=True)
    return df
def get_data(fname='stats.html', processes=20):
    """
    Retrieve medium article statistics
    :note: running on Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing
    :param fname: file name (should be 'stats.html')
    :param processes: integer number of processes
    :return df: dataframe of article data
    """
    t = get_table_rows(fname=fname)
    return process_in_parallel(t, processes=processes)
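# Minimal end-to-end sketch (assumes the Medium stats page has been saved to
# `data/stats.html`; the output path is illustrative). The __main__ guard keeps
# the multiprocessing pool from re-running this code when workers import the module.
if __name__ == '__main__':
    df = get_data(fname='stats.html', processes=20)
    df.to_csv('medium_stats.csv', index=False)
    print(df.head())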