# bargraphs.py — scrape daily view/read/fan stats from Medium's stats bar graphs.
import math
import time
from datetime import datetime, timedelta

import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
  9. def get_all_pages(driver, xpath, months, suffix):
  10. # Initially starting at today
  11. latest_date_in_graph = datetime.now().date()
  12. print('Starting on ', latest_date_in_graph)
  13. views = []
  14. dates = []
  15. # Iterate through the graphs
  16. for m in range(months + 1):
  17. graph_views = []
  18. graph_dates = []
  19. # Extract the bar graph
  20. bargraph = BeautifulSoup(driver.page_source).find_all(
  21. attrs={'class': 'bargraph'})[0]
  22. # Get all the bars in the bargraph
  23. bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
  24. # Sort the bar data by x position (which will be date order) with most recent first
  25. bardata = sorted(bardata, key=lambda x: float(
  26. x.get('x')), reverse=True)
  27. bardata = [bar.get('data-tooltip') for bar in bardata]
  28. latest_day = int(bardata[0].split('\xa0')[-1])
  29. # Some months are not overlapping
  30. if latest_day != latest_date_in_graph.day:
  31. latest_date_in_graph -= timedelta(days=1)
  32. # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
  33. for i, data in enumerate(bardata):
  34. graph_views.append(float(data.split(' ')[0].replace(',', '')))
  35. graph_dates.append(latest_date_in_graph - timedelta(days=i))
  36. views.extend(graph_views)
  37. dates.extend(graph_dates)
  38. # Find the earliest date in the graph
  39. earliest_date_in_graph = graph_dates[-1]
  40. # Update the latest date in the next graph
  41. latest_date_in_graph = earliest_date_in_graph
  42. # Go to the previous graph
  43. driver.find_element_by_xpath(xpath).click()
  44. time.sleep(2)
  45. print(f'{100 * m /(months):.0f}% complete.', end='\r')
  46. results = pd.DataFrame({'date': pd.to_datetime(
  47. dates), suffix: views}).groupby('date').sum()
  48. results = results.loc[results[results['views'] != 0.0].index.min():, ]
  49. print('First views on ', str(results.index.min().date()))
  50. # Save using the date as the file name
  51. fname = f'data/{str(datetime.now().date())}_{suffix}'
  52. results.to_parquet(fname)
  53. print('Stats saved to ', fname)
  54. return results
  55. if __name__ == "__main__":
  56. # Go to the website
  57. driver = webdriver.Chrome(ChromeDriverManager().install())
  58. driver.get("https://medium.com/me/stats")
  59. # Wait for user to log in
  60. input('Waiting for you to log in. Press enter when ready: ')
  61. # Find earliest date
  62. earliest_article_date = parser.parse(
  63. input('Enter earliest article date as string: ')).date()
  64. days = (datetime.now().date()
  65. - earliest_article_date).total_seconds() / (60 * 60 * 24)
  66. months = math.ceil(days / 30)
  67. # Get the xpath from user
  68. xpath = input('Paste xpath with no quotation marks: ')
  69. # Gather the results
  70. results = get_all_pages(driver, xpath, months, suffix='views')
  71. print('Refresh page and click on reads')
  72. # Get the xpath from user
  73. xpath = input('Paste xpath with no quotation marks: ')
  74. # Gather the results
  75. results = get_all_pages(driver, xpath, months, suffix='reads')
  76. print('Refresh page and click on fans')
  77. # Get the xpath from user
  78. xpath = input('Paste xpath with no quotation marks: ')
  79. # Gather the results
  80. results = get_all_pages(driver, xpath, months, suffix='fans')
  81. print("Complete. All results saved in data directory.")