# bargraphs.py
import math
import time
from datetime import datetime, timedelta

import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
  9. def get_all_pages(driver, xpath, months, suffix):
  10. # Initially starting at today
  11. latest_date_in_graph = datetime.now().date()
  12. print("Starting on ", latest_date_in_graph)
  13. views = []
  14. dates = []
  15. # Iterate through the graphs
  16. for m in range(months + 1):
  17. graph_views = []
  18. graph_dates = []
  19. # Extract the bar graph
  20. bargraph = BeautifulSoup(driver.page_source).find_all(
  21. attrs={"class": "bargraph"}
  22. )[0]
  23. # Get all the bars in the bargraph
  24. bardata = bargraph.find_all(attrs={"class": "bargraph-bar"})
  25. # Sort the bar data by x position (which will be date order) with most recent first
  26. bardata = sorted(bardata, key=lambda x: float(x.get("x")), reverse=True)
  27. bardata = [bar.get("data-tooltip") for bar in bardata]
  28. latest_day = int(bardata[0].split("\xa0")[-1])
  29. # Some months are not overlapping
  30. if latest_day != latest_date_in_graph.day:
  31. latest_date_in_graph -= timedelta(days=1)
  32. # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
  33. for i, data in enumerate(bardata):
  34. graph_views.append(float(data.split(" ")[0].replace(",", "")))
  35. graph_dates.append(latest_date_in_graph - timedelta(days=i))
  36. views.extend(graph_views)
  37. dates.extend(graph_dates)
  38. # Find the earliest date in the graph
  39. earliest_date_in_graph = graph_dates[-1]
  40. # Update the latest date in the next graph
  41. latest_date_in_graph = earliest_date_in_graph
  42. # Go to the previous graph
  43. driver.find_element_by_xpath(xpath).click()
  44. time.sleep(2)
  45. print(f"{100 * m /(months):.0f}% complete.", end="\r")
  46. results = (
  47. pd.DataFrame({"date": pd.to_datetime(dates), suffix: views})
  48. .groupby("date")
  49. .sum()
  50. )
  51. results = results.loc[
  52. results[results["views"] != 0.0].index.min() :,
  53. ]
  54. print("First views on ", str(results.index.min().date()))
  55. # Save using the date as the file name
  56. fname = f"data/{str(datetime.now().date())}_{suffix}"
  57. results.to_parquet(fname)
  58. print("Stats saved to ", fname)
  59. return results
  60. if __name__ == "__main__":
  61. # Go to the website
  62. driver = webdriver.Chrome(ChromeDriverManager().install())
  63. driver.get("https://medium.com/me/stats")
  64. # Wait for user to log in
  65. input("Waiting for you to log in. Press enter when ready: ")
  66. # Find earliest date
  67. earliest_article_date = parser.parse(
  68. input("Enter earliest article date as string: ")
  69. ).date()
  70. days = (datetime.now().date() - earliest_article_date).total_seconds() / (
  71. 60 * 60 * 24
  72. )
  73. months = math.ceil(days / 30)
  74. # Get the xpath from user
  75. xpath = input("Paste xpath with no quotation marks: ")
  76. # Gather the results
  77. results = get_all_pages(driver, xpath, months, suffix="views")
  78. print("Refresh page and click on reads")
  79. # Get the xpath from user
  80. xpath = input("Paste xpath with no quotation marks: ")
  81. # Gather the results
  82. results = get_all_pages(driver, xpath, months, suffix="reads")
  83. print("Refresh page and click on fans")
  84. # Get the xpath from user
  85. xpath = input("Paste xpath with no quotation marks: ")
  86. # Gather the results
  87. results = get_all_pages(driver, xpath, months, suffix="fans")
  88. print("Complete. All results saved in data directory.")