# retrieval.py

from multiprocessing import Pool
import re
from collections import Counter
from datetime import datetime
from itertools import chain
from timeit import default_timer as timer

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_table_rows(fname="stats.html"):
    """
    Extract the table rows from the Medium statistics page.

    :param fname: string name of the file stored in the `data` directory
    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
    """
    soup = BeautifulSoup(open(f"data/{fname}", "r", encoding="utf8"), features="lxml")
    table_rows = soup.find_all(attrs={"class": "sortableTable-row js-statsTableRow"})
    print(f"Found {len(table_rows)} entries in table.")
    return table_rows


def convert_timestamp(ts: int, tz: str):
    """Convert a Unix timestamp in milliseconds to a timezone-naive datetime in `tz`."""
    return (
        pd.to_datetime(ts, origin="unix", unit="ms")
        .tz_localize("UTC")
        .tz_convert(tz)
        .tz_localize(None)
    )
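
# Example usage (a sketch; the timestamp below is illustrative and corresponds
# to 2019-01-01 00:00:00 UTC, shown here converted to America/Chicago):
# >>> convert_timestamp(1546300800000, tz="America/Chicago")
# Timestamp('2018-12-31 18:00:00')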


def process_entry(entry, parallel=True, tz="America/Chicago"):
    """
    Extract data from one entry in the table.

    :param entry: BeautifulSoup tag (or its string representation when run in parallel)
    :param parallel: Boolean for whether the function is being run in parallel
    :param tz: string representing the timezone for started and published times
    :return entry_dict: dictionary with data about the entry
    """
    # Rows are passed as strings for multiprocessing, so convert back to soup
    if parallel:
        entry = BeautifulSoup(entry, features="lxml").body.tr
    entry_dict = {}
    # Extract the numeric statistics from the table row
    for value, key in zip(
        entry.find_all(attrs={"class": "sortableTable-value"}),
        ["published_date", "views", "reads", "ratio", "fans"],
    ):
        entry_dict[key] = float(value.text) if key == "ratio" else int(value.text)
    entry_dict["read_time"] = int(
        entry.find_all(attrs={"class": "readingTime"})[0].get("title").split(" ")[0]
    )
    # Unlisted vs published
    entry_dict["type"] = (
        "unlisted" if len(entry.find_all(text=" Unlisted")) > 0 else "published"
    )
    # Publication
    publication = entry.find_all(attrs={"class": "sortableTable-text"})
    if "In" in publication[0].text:
        entry_dict["publication"] = publication[0].text.split("In ")[1].split("View")[0]
    else:
        entry_dict["publication"] = "None"
    # Convert datetimes
    entry_dict["published_date"] = convert_timestamp(
        entry_dict["published_date"], tz=tz
    )
    # The data-timestamp attribute is a string, so cast it to int first
    entry_dict["started_date"] = convert_timestamp(
        int(entry.get("data-timestamp")), tz=tz
    )
    # Get the link to the article
    link = entry.find_all(text="View story", attrs={"class": "sortableTable-link"})[
        0
    ].get("href")
    entry_dict["link"] = link
    # Retrieve the article and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features="lxml")
    # Get the title
    try:
        title = entry_soup.h1.text
    except AttributeError:
        # Some pages (e.g. responses) have no <h1> title
        title = "response"
    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
    # Main text entries
    entry_text = [
        p.text for p in entry_soup.find_all(["h1", "h2", "h3", "p", "blockquote"])
    ]
    # Make sure to catch list items as well
    entry_text.extend(
        s.text
        for s in entry_soup.find_all(attrs={"class": "graf graf--li graf-after--li"})
    )
    entry_text.extend(
        s.text
        for s in entry_soup.find_all(attrs={"class": "graf graf--li graf-after--p"})
    )
    entry_text.extend(
        s.text
        for s in entry_soup.find_all(
            attrs={"class": "graf graf--li graf-after--blockquote"}
        )
    )
    entry_text.extend(
        s.text
        for s in entry_soup.find_all(
            attrs={"class": "graf graf--li graf-after--pullquote"}
        )
    )
    entry_text = " ".join(entry_text)
    # Word count
    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))
    # Number of claps, e.g. "15 claps", "1.2K claps", or "3K claps"
    clap_pattern = re.compile(
        r"^[0-9]{1,} claps|^[0-9]{1,}\.[0-9]{1,}K claps|^[0-9]{1,}K claps"
    )
    claps = entry_soup.find_all(text=clap_pattern)
    if len(claps) > 0:
        if "K" in claps[0]:
            clap_number = int(1e3 * float(claps[0].split("K")[0]))
        else:
            clap_number = int(claps[0].split(" ")[0])
    else:
        clap_number = 0
    # Post tags
    tags = entry_soup.find_all(attrs={"class": "tags tags--postTags tags--borderless"})
    tags = [li.text for li in tags[0].find_all("li")]
    # Responses to the entry
    responses = entry_soup.find_all(
        attrs={
            "class": "button button--chromeless u-baseColor--buttonNormal u-marginRight12",
            "data-action": "scroll-to-responses",
        }
    )
    num_responses = int(responses[0].text) if len(responses) > 0 else 0
    # Store in dictionary
    entry_dict["title"] = title
    entry_dict["title_word_count"] = title_word_count
    entry_dict["text"] = entry_text
    entry_dict["word_count"] = word_count
    entry_dict["claps"] = clap_number
    entry_dict["tags"] = tags
    entry_dict["num_responses"] = num_responses
    # Time since publication, in days
    entry_dict["days_since_publication"] = (
        datetime.now() - entry_dict["published_date"]
    ).total_seconds() / (3600 * 24)
    return entry_dict
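
# Example: processing a single row without the pool (a sketch; assumes
# data/stats.html exists and the machine is online to fetch each article):
# rows = get_table_rows()
# post = process_entry(str(rows[0]))              # string input, as in the parallel path
# post = process_entry(rows[0], parallel=False)   # or pass the BeautifulSoup tag directly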


def process_in_parallel(table_rows, processes=20):
    """
    Process all the stats in a table in parallel.

    :note: make sure to set the correct time zone in `process_entry`
    :note: running on a Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing
    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of worker processes to use in parallel
    :return df: dataframe of information about each post
    """
    # Convert the rows to strings so they can be pickled for multiprocessing
    table_rows_str = [str(r) for r in table_rows]
    # Process each article in parallel
    pool = Pool(processes=processes)
    results = []
    start = timer()
    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
        # Report progress
        print(f"{100 * i / len(table_rows_str):.2f}% complete.", end="\r")
        results.append(r)
    pool.close()
    pool.join()
    end = timer()
    print(f"Processed {len(table_rows_str)} articles in {end - start:.2f} seconds.")
    # Convert to dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={"ratio": "read_ratio"}, inplace=True)
    # Add extra columns with more data
    df["claps_per_word"] = df["claps"] / df["word_count"]
    df["editing_days"] = (
        (df["published_date"] - df["started_date"]).dt.total_seconds() / (60 * 60 * 24)
    ).astype(int)
    # Rounding
    df["published_date"] = df["published_date"].dt.round("min")
    df["started_date"] = df["started_date"].dt.round("min")
    df["read_ratio"] = df["read_ratio"].round(2)
    # 5 most common tags (might want to include more tags)
    n = 5
    all_tags = list(chain(*df["tags"].tolist()))
    tag_counts = Counter(all_tags)
    tags = tag_counts.most_common(n)
    # Add an indicator column for each of the most common tags
    for tag, _ in tags:
        flag = [1 if tag in post_tags else 0 for post_tags in df["tags"]]
        df.loc[:, f"<tag>{tag}"] = flag
    df.sort_values("published_date", inplace=True)
    return df


def get_data(fname="stats.html", processes=20):
    """
    Retrieve Medium article statistics.

    :note: running on a Mac may first require setting
           export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
           from the command line to enable parallel processing
    :param fname: file name (should be 'stats.html')
    :param processes: integer number of processes
    :return df: dataframe of article data
    """
    t = get_table_rows(fname=fname)
    return process_in_parallel(t, processes=processes)
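

# Minimal command-line entry point (a sketch): the __main__ guard is required for
# multiprocessing on platforms that spawn worker processes (macOS/Windows); the
# output file name below is illustrative, not part of the original script.
if __name__ == "__main__":
    df = get_data(fname="stats.html", processes=20)
    df.to_csv("data/medium_stats.csv", index=False)
    print(f"Saved statistics for {len(df)} articles to data/medium_stats.csv")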