1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from dateutil import parser
- from datetime import datetime, timedelta
- import pandas as pd
- import math
- import time
# Open a Chrome session on the stats page. Logging in is done by hand in the
# browser window; the script simply blocks until the user presses enter.
driver = webdriver.Chrome()
driver.get("https://medium.com/me/stats")
input('Waiting for you to log in. Press enter when ready: ')

# Ask how far back to go, then turn that span into a number of graph pages.
earliest_article_date = parser.parse(
    input('Enter earliest article date as string: ')
).date()

# Dividing one timedelta by another yields the span as a float number of days.
days = (datetime.now().date() - earliest_article_date) / timedelta(days=1)

# One page is budgeted per 30 days, rounded up so the earliest date is covered.
months = math.ceil(days / 30)
def get_all_pages(driver, xpath: str, months: int) -> pd.DataFrame:
    """Collect daily view counts by paging backwards through the bar graph.

    Starting from today's date, the graph currently shown in the browser is
    parsed out of ``driver.page_source``, its bars are assigned consecutive
    calendar dates (newest bar first), and the element located by ``xpath``
    is clicked to load the next graph. This is repeated ``months + 1`` times.

    Args:
        driver: Selenium WebDriver already showing the page that contains the
            graph. Only ``page_source`` and ``find_element`` are used.
        xpath: XPath of the element that is clicked to move to the previous
            graph.
        months: Number of additional graphs to read after the first one.

    Returns:
        A DataFrame indexed by ``date`` with a single ``views`` column.
        Repeated dates are combined by summing, and the frame is trimmed so
        it starts at the first date with a non-zero view count.

    Raises:
        IndexError: If the page has no element with class ``bargraph`` or the
            graph contains no ``bargraph-bar`` elements.
    """
    # Initially starting at today
    latest_date_in_graph = datetime.now().date()
    print('Starting on ', latest_date_in_graph)

    views = []
    dates = []

    # Iterate through the graphs
    for m in range(months + 1):
        # Name the parser explicitly: letting BeautifulSoup guess emits a
        # warning and makes the result depend on which parsers are installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        bargraph = soup.find_all(attrs={'class': 'bargraph'})[0]

        # Sort the bars by x position (which will be date order), newest first
        bars = sorted(
            bargraph.find_all(attrs={'class': 'bargraph-bar'}),
            key=lambda bar: float(bar.get('x')),
            reverse=True,
        )
        tooltips = [bar.get('data-tooltip') for bar in bars]

        # The text after the last non-breaking space in the tooltip is read
        # as the day of the month of the newest bar.
        latest_day = int(tooltips[0].split('\xa0')[-1])

        # Some months are not overlapping: when the newest bar is not the
        # expected day, this graph starts one day earlier.
        if latest_day != latest_date_in_graph.day:
            latest_date_in_graph -= timedelta(days=1)

        # The first space-delimited token of each tooltip is the view count,
        # possibly with thousands separators.
        graph_views = [
            float(tooltip.split(' ')[0].replace(',', ''))
            for tooltip in tooltips
        ]
        # Bars are newest-to-oldest, so bar i is i days before the newest one.
        graph_dates = [
            latest_date_in_graph - timedelta(days=i)
            for i in range(len(tooltips))
        ]

        views.extend(graph_views)
        dates.extend(graph_dates)

        # The earliest date in this graph is where the next graph picks up
        latest_date_in_graph = graph_dates[-1]

        # Go to the previous graph. find_element('xpath', ...) is used because
        # find_element_by_xpath no longer exists in Selenium 4; this form is
        # accepted by both Selenium 3 and 4.
        driver.find_element('xpath', xpath).click()
        time.sleep(2)

        # months == 0 means a single page; avoid dividing by zero in that case.
        percent = 100 * m / months if months else 100.0
        print(f'{percent}% complete.', end='\r')

    results = pd.DataFrame(
        {'date': pd.to_datetime(dates), 'views': views}
    ).groupby('date').sum()

    # Drop the leading run of days before the first non-zero view count
    first_nonzero = results[results['views'] != 0.0].index.min()
    results = results.loc[first_nonzero:, ]

    print('First views on ', str(results.index.min().date()))
    return results
# Ask for the XPath of the control that pages the graph backwards, scrape all
# pages, then persist the result as a parquet file named after today's date.
xpath = input('Paste xpath as string: ')
try:
    results = get_all_pages(driver, xpath, months)
finally:
    # End the WebDriver session whether or not scraping succeeded; otherwise
    # the browser and its driver process are left running after the script.
    driver.quit()

fname = f'{str(datetime.now().date())}_stats'
results.to_parquet(fname)
print('Stats saved to ', fname)
|