# page_views_wrangling.py

  1. import locale
  2. import glob
  3. import os
  4. import pandas
  5. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  6. (pandas.concat(pandas.read_csv(fname,
  7. comment='#',
  8. dtype={'Pageviews': str})
  9. .head(5_000)
  10. for fname in glob.glob(os.path.join(DATA_DIR, '*.csv.gz')))
  11. .set_index('Page')
  12. .dropna()
  13. .drop(columns='Page Value')
  14. .assign(**{'Pageviews': lambda df: df['Pageviews'].apply(locale.atoi),
  15. 'Unique Pageviews': lambda df: df['Unique Pageviews'].apply(locale.atoi),
  16. 'Avg. Time on Page': lambda df: pandas.to_timedelta(df['Avg. Time on Page'].str.lstrip('<')).dt.seconds,
  17. 'Entrances': lambda df: df['Entrances'].apply(locale.atoi),
  18. 'Bounce Rate': lambda df: df['Bounce Rate'].str.rstrip('%').astype(float),
  19. '% Exit': lambda df: df['% Exit'].str.rstrip('%').astype(float)})
  20. .to_parquet(os.path.join('data', 'pandas_page_views_2018.parquet'),
  21. engine='pyarrow'))