# bargraphs.py
import math
import time
from datetime import datetime, timedelta

import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
  9. def get_all_pages(driver, xpath, months, suffix):
  10. # Initially starting at today
  11. latest_date_in_graph = datetime.now().date()
  12. print("Starting on ", latest_date_in_graph)
  13. views = []
  14. dates = []
  15. # Iterate through the graphs
  16. for m in range(months + 1):
  17. graph_views = []
  18. graph_dates = []
  19. # Extract the bar graph
  20. bargraph = BeautifulSoup(driver.page_source).find_all(
  21. attrs={"class": "bargraph"}
  22. )[0]
  23. # Get all the bars in the bargraph
  24. bardata = bargraph.find_all(attrs={"class": "bargraph-bar"})
  25. # Sort the bar data by x position (which will be date order) with most recent first
  26. bardata = sorted(bardata, key=lambda x: float(x.get("x")), reverse=True)
  27. bardata = [bar.get("data-tooltip") for bar in bardata]
  28. latest_day = int(bardata[0].split("\xa0")[-1])
  29. # Some months are not overlapping
  30. if latest_day != latest_date_in_graph.day:
  31. latest_date_in_graph -= timedelta(days=1)
  32. # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
  33. for i, data in enumerate(bardata):
  34. graph_views.append(float(data.split(" ")[0].replace(",", "")))
  35. graph_dates.append(latest_date_in_graph - timedelta(days=i))
  36. views.extend(graph_views)
  37. dates.extend(graph_dates)
  38. # Find the earliest date in the graph
  39. earliest_date_in_graph = graph_dates[-1]
  40. # Update the latest date in the next graph
  41. latest_date_in_graph = earliest_date_in_graph
  42. # Go to the previous graph
  43. driver.find_element_by_xpath(xpath).click()
  44. time.sleep(2)
  45. print(f"{100 * m /(months):.0f}% complete.", end="\r")
  46. results = (
  47. pd.DataFrame({"date": pd.to_datetime(dates), suffix: views})
  48. .groupby("date")
  49. .sum()
  50. )
  51. results = results.loc[
  52. results[results["views"] != 0.0].index.min() :,
  53. ]
  54. print("First views on ", str(results.index.min().date()))
  55. # Save using the date as the file name
  56. fname = f"data/{str(datetime.now().date())}_{suffix}"
  57. results.to_parquet(fname)
  58. print("Stats saved to ", fname)
  59. return results
  60. if __name__ == "__main__":
  61. # Go to the website
  62. driver = webdriver.Chrome(ChromeDriverManager().install())
  63. driver.get("https://medium.com/me/stats")
  64. # Wait for user to log in
  65. input("Waiting for you to log in. Press enter when ready: ")
  66. # Find earliest date
  67. earliest_article_date = parser.parse(
  68. input("Enter earliest article date as string: ")
  69. ).date()
  70. days = (datetime.now().date() - earliest_article_date).total_seconds() / (
  71. 60 * 60 * 24
  72. )
  73. months = math.ceil(days / 30)
  74. # Get the xpath from user
  75. xpath = input("Paste xpath with no quotation marks: ")
  76. # Gather the results
  77. results = get_all_pages(driver, xpath, months, suffix="views")
  78. print("Refresh page and click on reads")
  79. # Get the xpath from user
  80. xpath = input("Paste xpath with no quotation marks: ")
  81. # Gather the results
  82. results = get_all_pages(driver, xpath, months, suffix="reads")
  83. print("Refresh page and click on fans")
  84. # Get the xpath from user
  85. xpath = input("Paste xpath with no quotation marks: ")
  86. # Gather the results
  87. results = get_all_pages(driver, xpath, months, suffix="fans")
  88. print("Complete. All results saved in data directory.")