view_extraction.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. import os
  2. from bs4 import BeautifulSoup
  3. from dateutil import parser
  4. from datetime import datetime
  5. import pandas as pd
  6. def process_bargraph(bargraph):
  7. bardata = [
  8. bar.get("data-tooltip")
  9. for bar in bargraph.find_all(attrs={"class": "bargraph-bar"})
  10. ]
  11. print(len(bardata))
  12. return
  13. # Sort by xposition
  14. bardata = sorted(bardata, key=lambda x: float(x.get("x")))
  15. views = [float(s.split(" ")[0].replace(",", "")) for s in bardata]
  16. dates = [
  17. s.split(" ")[-1].split("\xa0")[0] + " " + s.split(" ")[-1].split("\xa0")[1]
  18. for s in bardata
  19. ]
  20. year = str((datetime.now() - pd.Timedelta(days=i * 30)).year)
  21. dates = [parser.parse(d + " " + year) for d in dates]
  22. return views, dates
  23. files = os.listdir("html_pages")
  24. v = []
  25. d = []
  26. for fid in files:
  27. i = int(fid.split(".")[0].split("p")[1])
  28. graph = BeautifulSoup(open(f"html_pages/{fid}", "r")).find_all(
  29. attrs={"class": "bargraph"}
  30. )[0]
  31. r = process_bargraph(graph, i)
  32. v.extend(r[0])
  33. d.extend(r[1])
  34. results = pd.DataFrame({"date": d, "views": v})
  35. results["date"] = pd.to_datetime(results["date"])
  36. results.to_parquet("medium_views_time")