"""Collect per-bar view counts from saved HTML pages into one parquet file.

Every file in ``html_pages/`` is parsed, the first element with class
``bargraph`` is taken from it, and the (date, views) pairs read from its
bars are accumulated and written to ``medium_views_time``.
"""

import os
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser


def process_bargraph(bargraph, i=0):
    """Return the view counts and dates encoded in a bar graph's tooltips.

    Args:
        bargraph: Parsed element exposing ``find_all``; its descendants with
            class ``bargraph-bar`` must carry ``x`` and ``data-tooltip``
            attributes.
        i: Offset used to pick the year appended to each date: the year is
            that of "now minus ``i * 30`` days". The caller passes the
            integer taken from the page's file name.
            NOTE(review): presumably each page covers one ~30-day window,
            ``i`` windows back from today -- confirm against how the pages
            were saved.

    Returns:
        A ``(views, dates)`` tuple of equal-length lists, ordered by the
        bars' ``x`` attribute: ``views`` holds floats, ``dates`` holds
        ``datetime`` objects.
    """
    # Sort the bar elements themselves by x position; the tooltip strings
    # carry no position information, so ordering must happen before they
    # are extracted.
    bars = sorted(
        bargraph.find_all(attrs={"class": "bargraph-bar"}),
        key=lambda bar: float(bar.get("x")),
    )
    tooltips = [bar.get("data-tooltip") for bar in bars]

    # The first space-separated token is the count, possibly with thousands
    # separators (e.g. "1,234").
    views = [float(tip.split(" ")[0].replace(",", "")) for tip in tooltips]

    # The tooltip text carries no usable year here, so one is derived from
    # the offset and appended before parsing.
    year = str((datetime.now() - pd.Timedelta(days=i * 30)).year)

    dates = []
    for tip in tooltips:
        # The last space-separated token holds the date, with its pieces
        # joined by non-breaking spaces; only the first two are used.
        pieces = tip.split(" ")[-1].split("\xa0")
        dates.append(parser.parse(pieces[0] + " " + pieces[1] + " " + year))

    return views, dates


files = os.listdir("html_pages")
v = []
d = []
for fid in files:
    # File names look like "p<N>.<ext>"; N is the offset handed to
    # process_bargraph.
    i = int(fid.split(".")[0].split("p")[1])
    # Close each page as soon as it has been parsed.
    with open(f"html_pages/{fid}", "r") as fh:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(fh, "html.parser")
    graph = soup.find_all(attrs={"class": "bargraph"})[0]
    r = process_bargraph(graph, i)
    v.extend(r[0])
    d.extend(r[1])

results = pd.DataFrame({"date": d, "views": v})
results["date"] = pd.to_datetime(results["date"])
results.to_parquet("medium_views_time")