Kaynağa Gözat

Adding solutions to first page views tutorial

Marc Garcia 6 yıl önce
ebeveyn
işleme
108460edf0

+ 53 - 0
03_Page_views_wrangling.ipynb

@@ -0,0 +1,53 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas\n",
+    "\n",
+    "DATA_DIR = os.path.join('data', 'pandas_website')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load solutions/page_views_1.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

+ 46 - 0
04_Page_views_eda.ipynb

@@ -0,0 +1,46 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas\n",
+    "\n",
+    "DATA_DIR = os.path.join('data', 'pandas_website')\n",
+    "\n",
+    "df = pandas.read_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

BIN
data/pandas_website/pandas_website_views_2018.parquet


+ 2 - 1
environment.yml

@@ -4,6 +4,7 @@ channels:
   - defaults
 dependencies:
   - python=3.7
+  - jupyter=1.0
   - pandas=0.25
+  - pyarrow=0.14.1
   - matplotlib=2.2
-  - jupyter=1.0

+ 23 - 0
solutions/page_views_1.py

@@ -0,0 +1,23 @@
+import locale  # locale.atoi is used below to turn the string count columns into ints
+import glob
+# Flat script, no definitions: reads every '*.csv.gz' under DATA_DIR, cleans the columns and writes one parquet file.
+# NOTE(review): `pandas`, `os` and `DATA_DIR` are not defined here; they must already exist in the namespace that runs this.
+locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')  # process-wide setting; raises locale.Error if this locale is not installed
+# locale.atoi follows the LC_NUMERIC conventions of the locale selected above.
+# Everything below is one chained expression; its only effect is the parquet file written at the end.
+(pandas.concat(pandas.read_csv(fname,
+                               comment='#',  # the remainder of a line after '#' is not parsed
+                               dtype={'Pageviews': str})  # read as str; converted with locale.atoi in assign() below
+                     .head(5_000)  # only the first 5,000 rows of each file are kept
+               for fname in glob.glob(os.path.join(DATA_DIR, '*.csv.gz')))  # generator: one DataFrame per matching file
+       .set_index('Page')
+       .dropna()  # discard rows that have a missing value in any column
+       .drop(columns='Page Value')
+       .assign(**{'Pageviews': lambda df: df['Pageviews'].apply(locale.atoi),  # ** dict: names such as '% Exit' cannot be keyword arguments
+                  'Unique Pageviews': lambda df: df['Unique Pageviews'].apply(locale.atoi),  # presumably read as str (atoi needs str) - confirm against the data
+                  'Avg. Time on Page': lambda df: pandas.to_timedelta(df['Avg. Time on Page'].str.lstrip('<')).dt.seconds,  # drops leading '<'; NOTE(review): .dt.seconds is the seconds component (< 1 day), not total seconds
+                  'Entrances': lambda df: df['Entrances'].apply(locale.atoi),  # presumably read as str as well - confirm against the data
+                  'Bounce Rate': lambda df: df['Bounce Rate'].str.rstrip('%').astype(float),  # trailing '%' removed; the number is not divided by 100
+                  '% Exit': lambda df: df['% Exit'].str.rstrip('%').astype(float)})  # same treatment as 'Bounce Rate'
+       .to_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'),  # output is written next to the input files
+                   engine='pyarrow'))  # parquet engine is fixed to pyarrow