page_views_eda.py 588 B

123456789101112131415161718
  1. import operator
  2. import urllib.parse
  3. import pandas
  4. page_views = pandas.read_parquet(PAGE_VIEWS_FNAME)
  5. page_views.index = (page_views.index
  6. .to_series()
  7. .apply(urllib.parse.urlparse)
  8. .apply(operator.attrgetter('path'))
  9. .str.split('/')
  10. .str[-1]
  11. .str.rstrip('.html'))
  12. docstring_errors = (pandas.read_hdf(DOCSTRING_ERRORS_FNAME)
  13. .join(page_views.groupby('Page')['Pageviews'].sum()))