retrieval.py

# Data science imports
from multiprocessing import Pool
import re
from itertools import chain
from collections import Counter
from datetime import datetime
from timeit import default_timer as timer

import requests
from bs4 import BeautifulSoup
import pandas as pd
import pytz


def get_table_rows(fname='stats.html'):
    """
    Extract the table rows from the saved statistics page.

    :param fname: string name of the file stored in the `data` directory
    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
    """
    with open(f'data/{fname}', 'r') as f:
        soup = BeautifulSoup(f, features='lxml')
    table_rows = soup.find_all(
        attrs={'class': 'sortableTable-row js-statsTableRow'})
    print(f'Found {len(table_rows)} entries in table.')
    return table_rows
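
# Hypothetical usage sketch, assuming the stats page was saved to `data/stats.html`
# beforehand; the resulting rows are normally handed straight to `process_in_parallel`:
#   rows = get_table_rows('stats.html')
#   first_row = rows[0]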


def convert_timestamp(ts: int, tz: str):
    """Convert a millisecond unix timestamp to a naive datetime in the given timezone."""
    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz).tz_localize(None)
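
# Worked example with an illustrative value: 1_546_300_800_000 ms is 2019-01-01 00:00 UTC,
# so convert_timestamp(1_546_300_800_000, 'America/Chicago') returns
# Timestamp('2018-12-31 18:00:00') after the -6 hour CST offset is applied and the
# timezone info is dropped.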


def process_entry(entry, parallel=True, tz='America/Chicago'):
    """
    Extract data from one entry in the stats table.

    :param entry: BeautifulSoup tag (or its string representation when run in parallel)
    :param parallel: Boolean for whether the function is being run in parallel
    :param tz: string representing the timezone for started and published times
    :return entry_dict: dictionary with data about the entry
    """
    # Rows are passed as strings for multiprocessing; convert back to soup
    if parallel:
        entry = BeautifulSoup(entry, features='lxml').body.tr
    entry_dict = {}
    # Extract the numeric statistics
    for value, key in zip(entry.find_all(attrs={'class': 'sortableTable-value'}),
                          ['published_date', 'views', 'reads', 'ratio', 'fans']):
        entry_dict[key] = float(value.text) if key == 'ratio' else int(value.text)
    entry_dict['read_time'] = int(
        entry.find_all(attrs={'class': 'readingTime'})[0].get('title').split(' ')[0])
    # Unlisted vs published
    entry_dict['type'] = 'unlisted' if entry.find_all(text=' Unlisted') else 'published'
    # Publication
    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
    if 'In' in publication[0].text:
        entry_dict['publication'] = publication[0].text.split('In ')[1].split('View')[0]
    else:
        entry_dict['publication'] = 'None'
    # Convert datetimes (the start time is stored as a millisecond timestamp attribute)
    entry_dict['published_date'] = convert_timestamp(
        entry_dict['published_date'], tz=tz)
    entry_dict['started_date'] = convert_timestamp(
        int(entry.get('data-timestamp')), tz=tz)
    # Get the link to the article itself
    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')
    # Retrieve the article and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features='lxml')
    # Get the title (responses have no h1)
    try:
        title = entry_soup.h1.text
    except AttributeError:
        title = 'response'
    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
    # Main text elements
    entry_text = [p.text for p in entry_soup.find_all(
        ['h1', 'h2', 'h3', 'p', 'blockquote'])]
    # List items carry their own classes, so catch those as well
    for li_class in ('graf graf--li graf-after--li',
                     'graf graf--li graf-after--p',
                     'graf graf--li graf-after--blockquote',
                     'graf graf--li graf-after--pullquote'):
        entry_text.extend(s.text for s in entry_soup.find_all(attrs={'class': li_class}))
    entry_text = ' '.join(entry_text)
    # Word count
    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))
    # Number of claps, e.g. "105 claps" or "1.2K claps"
    clap_pattern = re.compile(
        r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
    claps = entry_soup.find_all(text=clap_pattern)
    if len(claps) > 0:
        if 'K' in claps[0]:
            clap_number = int(1e3 * float(claps[0].split('K')[0]))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0
    # Post tags
    tag_container = entry_soup.find_all(
        attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tag_container[0].find_all('li')] if tag_container else []
    # Responses to the entry
    responses = entry_soup.find_all(attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
                                           'data-action': 'scroll-to-responses'})
    num_responses = int(responses[0].text) if len(responses) > 0 else 0
    # Store in the dictionary
    entry_dict['title'] = title
    entry_dict['title_word_count'] = title_word_count
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags
    entry_dict['num_responses'] = num_responses
    # Time since publication in days
    entry_dict['days_since_publication'] = (
        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)
    return entry_dict
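
# A single row can also be processed eagerly for debugging; a hypothetical check,
# assuming `data/stats.html` is in place (note `parallel=False` so the soup tag is used directly):
#   row = get_table_rows()[0]
#   print(process_entry(row, parallel=False))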


def process_in_parallel(table_rows, processes=20):
    """
    Process all the entries in the stats table in parallel.

    :note: make sure to set the correct timezone in `process_entry`
    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of worker processes to use in parallel
    :return df: dataframe of information about each post
    """
    # Convert the rows to strings so they can be pickled for multiprocessing
    table_rows_str = [str(r) for r in table_rows]
    # Process each article in parallel
    pool = Pool(processes=processes)
    results = []
    start = timer()
    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
        # Report progress
        print(f'{100 * i / len(table_rows_str):.2f}% complete.', end='\r')
        results.append(r)
    pool.close()
    pool.join()
    end = timer()
    print(
        f'Processed {len(table_rows_str)} articles in {end - start:.2f} seconds.')
    # Convert to a dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)
    # Add extra columns with more data
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['editing_days'] = ((df['published_date'] - df['started_date']
                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)
    # Rounding
    df['published_date'] = df['published_date'].dt.round('min')
    df['started_date'] = df['started_date'].dt.round('min')
    df['read_ratio'] = df['read_ratio'].round(2)
    # 5 most common tags (might want to include more tags)
    n = 5
    all_tags = list(chain(*df['tags'].tolist()))
    tag_counts = Counter(all_tags)
    top_tags = tag_counts.most_common(n)
    # Add an indicator column for each of the most common tags
    for tag, count in top_tags:
        flag = [1 if tag in post_tags else 0 for post_tags in df['tags']]
        df.loc[:, f'<tag>{tag}'] = flag
    df.sort_values('published_date', inplace=True)
    return df


def get_data(fname='stats.html', processes=20):
    """Read the saved stats table and process every entry into a dataframe."""
    t = get_table_rows(fname=fname)
    return process_in_parallel(t, processes=processes)
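

# A minimal, hypothetical entry point, assuming the stats page has already been
# saved to `data/stats.html`; the __main__ guard also keeps multiprocessing safe
# when the module is run as a script. The output path is only an example.
if __name__ == '__main__':
    df = get_data(fname='stats.html', processes=20)
    print(df.head())
    df.to_csv('data/stats.csv', index=False)  # assumed destination; any format works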