from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from dateutil import parser
from datetime import datetime, timedelta
import pandas as pd
import math
import time

driver = webdriver.Chrome()
driver.get("https://medium.com/me/stats")
input('Waiting for you to log in. Press enter when ready: ')

earliest_article_date = parser.parse(
    input('Enter earliest article date as string: ')).date()
days = (datetime.now().date() - earliest_article_date).days
months = math.ceil(days / 30)


def get_all_pages(driver, xpath, months):
    """Page backwards through the monthly bar graphs, collecting daily view counts."""
    # Initially starting at today
    latest_date_in_graph = datetime.now().date()
    print('Starting on', latest_date_in_graph)
    views = []
    dates = []

    # Iterate through the graphs, one month at a time
    for m in range(months + 1):
        graph_views = []
        graph_dates = []

        # Extract the bar graph from the rendered page
        bargraph = BeautifulSoup(driver.page_source, 'html.parser').find_all(
            attrs={'class': 'bargraph'})[0]
        # Get all the bars in the bar graph
        bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
        # Sort the bars by x position (which is date order), most recent first
        bardata = sorted(bardata, key=lambda x: float(x.get('x')), reverse=True)
        bardata = [bar.get('data-tooltip') for bar in bardata]

        # The tooltip ends with the day of the month, separated by a non-breaking space
        latest_day = int(bardata[0].split('\xa0')[-1])
        # Some months do not overlap; step back a day if the graph ends earlier
        if latest_day != latest_date_in_graph.day:
            latest_date_in_graph -= timedelta(days=1)

        # Walk the bars, which are now sorted in reverse date order (newest to oldest)
        for i, data in enumerate(bardata):
            graph_views.append(float(data.split(' ')[0].replace(',', '')))
            graph_dates.append(latest_date_in_graph - timedelta(days=i))

        views.extend(graph_views)
        dates.extend(graph_dates)

        # The earliest date in this graph becomes the latest date of the next one
        earliest_date_in_graph = graph_dates[-1]
        latest_date_in_graph = earliest_date_in_graph

        # Click through to the previous graph and wait for it to render
        driver.find_element(By.XPATH, xpath).click()
        time.sleep(2)
        print(f'{100 * m / months:.0f}% complete.', end='\r')

    results = pd.DataFrame({'date': pd.to_datetime(dates),
                            'views': views}).groupby('date').sum()
    # Drop the leading days before the first recorded view
    results = results.loc[results[results['views'] != 0.0].index.min():, ]
    print('First views on', str(results.index.min().date()))
    return results


xpath = input('Paste xpath as string: ')
results = get_all_pages(driver, xpath, months)

fname = f'{datetime.now().date()}_stats.parquet'
results.to_parquet(fname)
print('Stats saved to', fname)
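
# --- Optional: reading the saved stats back ---
# A minimal sketch of loading the output for analysis. It assumes pyarrow
# (or fastparquet) is installed for parquet support, and that the file was
# written today, so the name matches the f'{date}_stats.parquet' pattern
# used above; adjust fname if you saved the stats on a different day.
import pandas as pd
from datetime import datetime

fname = f'{datetime.now().date()}_stats.parquet'
results = pd.read_parquet(fname)

# The index is the date; the 'views' column holds the summed daily counts.
print(results.tail())
print('Total views:', int(results['views'].sum()))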