# bargraphs.py (2.7 KB)
  1. from bs4 import BeautifulSoup
  2. from selenium import webdriver
  3. from dateutil import parser
  4. from datetime import datetime, timedelta
  5. import pandas as pd
  6. import math
  7. import time
  8. driver = webdriver.Chrome()
  9. driver.get("https://medium.com/me/stats")
  10. input('Waiting for you to log in. Press enter when ready: ')
  11. earliest_article_date = parser.parse(
  12. input('Enter earliest article date as string: ')).date()
  13. days = (datetime.now().date()
  14. - earliest_article_date).total_seconds() / (60 * 60 * 24)
  15. months = math.ceil(days / 30)
  16. def get_all_pages(driver, xpath, months):
  17. # Initially starting at today
  18. latest_date_in_graph = datetime.now().date()
  19. print('Starting on ', latest_date_in_graph)
  20. views = []
  21. dates = []
  22. # Iterate through the graphs
  23. for m in range(months + 1):
  24. graph_views = []
  25. graph_dates = []
  26. # Extract the bar graph
  27. bargraph = BeautifulSoup(driver.page_source).find_all(
  28. attrs={'class': 'bargraph'})[0]
  29. # Get all the bars in the bargraph
  30. bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
  31. # Sort the bar data by x position (which will be date order) with most recent first
  32. bardata = sorted(bardata, key=lambda x: float(
  33. x.get('x')), reverse=True)
  34. bardata = [bar.get('data-tooltip') for bar in bardata]
  35. latest_day = int(bardata[0].split('\xa0')[-1])
  36. # Some months are not overlapping
  37. if latest_day != latest_date_in_graph.day:
  38. latest_date_in_graph -= timedelta(days=1)
  39. # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
  40. for i, data in enumerate(bardata):
  41. graph_views.append(float(data.split(' ')[0].replace(',', '')))
  42. graph_dates.append(latest_date_in_graph - timedelta(days=i))
  43. views.extend(graph_views)
  44. dates.extend(graph_dates)
  45. # Find the earliest date in the graph
  46. earliest_date_in_graph = graph_dates[-1]
  47. # Update the latest date in the next graph
  48. latest_date_in_graph = earliest_date_in_graph
  49. # Go to the previous graph
  50. driver.find_element_by_xpath(xpath).click()
  51. time.sleep(2)
  52. print(f'{100 * m /(months)}% complete.', end='\r')
  53. results = pd.DataFrame({'date': pd.to_datetime(
  54. dates), 'views': views}).groupby('date').sum()
  55. results = results.loc[results[results['views'] != 0.0].index.min():, ]
  56. print('First views on ', str(results.index.min().date()))
  57. return results
  58. xpath = input('Paste xpath as string: ')
  59. results = get_all_pages(driver, xpath, months)
  60. fname = f'{str(datetime.now().date())}_stats'
  61. results.to_parquet(fname)
  62. print('Stats saved to ', fname)