# view_extraction.py
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser
  6. def process_bargraph(bargraph):
  7. bardata = [bar.get('data-tooltip')
  8. for bar in bargraph.find_all(attrs={'class': 'bargraph-bar'})]
  9. print(len(bardata))
  10. return
  11. # Sort by xposition
  12. bardata = sorted(bardata, key=lambda x: float(x.get('x')))
  13. views = [float(s.split(' ')[0].replace(',', '')) for s in bardata]
  14. dates = [s.split(' ')[-1].split('\xa0')[0] + ' '
  15. + s.split(' ')[-1].split('\xa0')[1] for s in bardata]
  16. year = str((datetime.now() - pd.Timedelta(days=i * 30)).year)
  17. dates = [parser.parse(d + ' ' + year) for d in dates]
  18. return views, dates
  19. files = os.listdir('html_pages')
  20. v = []
  21. d = []
  22. for fid in files:
  23. i = int(fid.split('.')[0].split('p')[1])
  24. graph = BeautifulSoup(
  25. open(f'html_pages/{fid}', 'r')).find_all(attrs={'class': 'bargraph'})[0]
  26. r = process_bargraph(graph, i)
  27. v.extend(r[0])
  28. d.extend(r[1])
  29. results = pd.DataFrame({'date': d, 'views': v})
  30. results['date'] = pd.to_datetime(results['date'])
  31. results.to_parquet('medium_views_time')