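# Scrape daily Medium stats (views, reads, and fans) from the bar graphs on
# https://medium.com/me/stats with Selenium and BeautifulSoup, then save each
# metric as a parquet file in the data/ directory.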
from bs4 import BeautifulSoup
from selenium import webdriver
from dateutil import parser
from datetime import datetime, timedelta
import pandas as pd
import math
import time
from webdriver_manager.chrome import ChromeDriverManager
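# Note: the calls below use the Selenium 3 API (passing the driver path directly to
# webdriver.Chrome and calling driver.find_element_by_xpath). Selenium 4 replaces
# these with a Service object and driver.find_element(By.XPATH, ...), so either pin
# selenium<4 or adapt the two calls accordingly.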
def get_all_pages(driver, xpath, months, suffix):
    # Initially starting at today
    latest_date_in_graph = datetime.now().date()
    print('Starting on ', latest_date_in_graph)
    views = []
    dates = []
    # Iterate through the graphs
    for m in range(months + 1):
        graph_views = []
        graph_dates = []
        # Extract the bar graph
        bargraph = BeautifulSoup(driver.page_source, 'html.parser').find_all(
            attrs={'class': 'bargraph'})[0]
        # Get all the bars in the bargraph
        bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
        # Sort the bar data by x position (which is date order) with most recent first
        bardata = sorted(bardata, key=lambda x: float(
            x.get('x')), reverse=True)
        bardata = [bar.get('data-tooltip') for bar in bardata]
        # The last '\xa0'-separated token of the tooltip is the day of the month
        latest_day = int(bardata[0].split('\xa0')[-1])
        # Consecutive graphs do not always share a boundary day; if the newest
        # bar's day does not match, step the date back by one
        if latest_day != latest_date_in_graph.day:
            latest_date_in_graph -= timedelta(days=1)
        # Iterate through the bars, now sorted in reverse date order (newest to oldest)
        for i, data in enumerate(bardata):
            # The first space-separated token of the tooltip is the count, e.g. '1,234'
            graph_views.append(float(data.split(' ')[0].replace(',', '')))
            graph_dates.append(latest_date_in_graph - timedelta(days=i))
        views.extend(graph_views)
        dates.extend(graph_dates)
        # Find the earliest date in the graph
        earliest_date_in_graph = graph_dates[-1]
        # Update the latest date for the next graph
        latest_date_in_graph = earliest_date_in_graph
        # Go to the previous graph
        driver.find_element_by_xpath(xpath).click()
        time.sleep(2)
        print(f'{100 * m / months:.0f}% complete.', end='\r')
    results = pd.DataFrame({'date': pd.to_datetime(
        dates), suffix: views}).groupby('date').sum()
    # Keep only dates from the first nonzero day onwards
    results = results.loc[results[results[suffix] != 0.0].index.min():]
    print(f'First {suffix} on ', str(results.index.min().date()))
    # Save using the date as the file name
    fname = f'data/{str(datetime.now().date())}_{suffix}'
    results.to_parquet(fname)
    print('Stats saved to ', fname)
    return results
if __name__ == "__main__":
    # Go to the website
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get("https://medium.com/me/stats")
    # Wait for user to log in
    input('Waiting for you to log in. Press enter when ready: ')
    # Find earliest date
    earliest_article_date = parser.parse(
        input('Enter earliest article date as string: ')).date()
    days = (datetime.now().date()
            - earliest_article_date).total_seconds() / (60 * 60 * 24)
    months = math.ceil(days / 30)
    # Get the xpath from user
    xpath = input('Paste xpath with no quotation marks: ')
    # Gather the results
    results = get_all_pages(driver, xpath, months, suffix='views')
    print('Refresh page and click on reads')
    # Get the xpath from user
    xpath = input('Paste xpath with no quotation marks: ')
    # Gather the results
    results = get_all_pages(driver, xpath, months, suffix='reads')
    print('Refresh page and click on fans')
    # Get the xpath from user
    xpath = input('Paste xpath with no quotation marks: ')
    # Gather the results
    results = get_all_pages(driver, xpath, months, suffix='fans')
    print("Complete. All results saved in data directory.")