
Adding tasks to perform to new tutorials

Marc Garcia, 6 years ago
Parent commit 9b8d62db52

+ 252 - 0
03_Docstring_errors_wrangling.ipynb

@@ -0,0 +1,252 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tutorial: Errors in the pandas API reference\n",
+    "\n",
+    "In Python, documentation of objects is defined in the objects themselves. For example:\n",
+    "\n",
+    "```python\n",
+    "def divide(dividend, divisor):\n",
+    "    \"\"\"\n",
+    "    Compute the division of two floating point numbers.\n",
+    "    \n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    dividend : float\n",
+    "        Number to divide.\n",
+    "    divisor : float\n",
+    "        Number to divide by.\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    float:\n",
+    "        The result of the division.\n",
+    "    \"\"\"\n",
+    "    return dividend / divisor\n",
+    "```\n",
+    "\n",
+    "There are tools to extract this documentation (named docstrings), and generate the\n",
+    "web version of it.\n",
+    "\n",
+    "To make sure the documentation is formatted correctly in the web, and to keep consistency among\n",
+    "the pages, there are some standards that we aim to follow. For historical reasons, many\n",
+    "docstrings don't follow these standards.\n",
+    "\n",
+    "Some of the errors are next (they are codified with a code):\n",
+    "\n",
+    "- **SS02**: Summary does not start with a capital letter\n",
+    "- **SS03**: Summary does not end with a period\n",
+    "- **PR01**: Parameters {missing_params} not documented\n",
+    "- **RT01**: No Returns section found\n",
+    "\n",
+    "The next docstring would return them:\n",
+    "\n",
+    "```python\n",
+    "def divide(dividend, divisor):\n",
+    "    \"\"\"\n",
+    "    compute the division of two floating point numbers\n",
+    "    \n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    dividend : float\n",
+    "        Number to divide.\n",
+    "    \"\"\"\n",
+    "    return dividend / divisor\n",
+    "```\n",
+    "\n",
+    "We developed a script that is able to automatically detect these errors and report\n",
+    "them. It can return all the errors in the whole pandas code base in json format with\n",
+    "the next command:\n",
+    "\n",
+    "```\n",
+    "./scripts/validate_docstrings.py --format=json > docstring_errors_pandas023.json.gz\n",
+    "```\n",
+    "\n",
+    "In this tutorial we will load the output of this script, and we will transform it\n",
+    "to keep the relevant information"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas\n",
+    "\n",
+    "DATA_FNAME = os.path.join('data', 'docstring_errors_pandas023.json.gz')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load data\n",
+    "\n",
+    "- Load data from the json file `DATA_FNAME`\n",
+    "- Try reading the data with different `orient` values, and read it so every row is a docstring"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
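One possible sketch of what this cell could contain (a hedged example, not the notebook's official solution; it assumes the setup cell defining `DATA_FNAME` has been run, and that pandas infers the gzip compression from the `.gz` extension):

```python
# orient='index' reads the JSON as a mapping {docstring name: fields},
# giving one row per docstring (this matches solutions/pandas_docstrings.py).
df = pandas.read_json(DATA_FNAME, orient='index')

# Trying other values such as orient='columns' shows how the shape of the
# resulting DataFrame changes depending on how the JSON is interpreted.
df.head()
```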
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### New columns\n",
+    "\n",
+    "- Create a column `docstring_length` with the number of characters of the docstring\n",
+    "- Create a column `problems` with the list or errors and warnings in a single list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
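A possible sketch for the two new columns, assuming the loaded `DataFrame` has `docstring`, `errors` and `warnings` columns (as in the solution script at the end of the notebook):

```python
df = df.assign(
    # number of characters of each docstring
    docstring_length=lambda df: df['docstring'].str.len(),
    # element-wise list concatenation: errors and warnings combined in one list
    problems=lambda df: df['errors'] + df['warnings'],
)
```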
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Delete information not needed\n",
+    "\n",
+    "- Remove docstrings of functions being deprecated\n",
+    "- Remove columns `errors` and `warnings`\n",
+    "- Remove the `docstring`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
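A hedged sketch of the clean-up step, assuming a boolean `deprecated` column exists (the solution script filters with `query('not deprecated')`):

```python
df = (df.query('not deprecated')              # drop deprecated objects
        .drop(columns=['errors', 'warnings',  # already merged into 'problems'
                       'docstring']))         # raw text no longer needed
```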
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create a row per problem\n",
+    "\n",
+    "- Discuss possible ways of creating a row for each problem in the lists of the column `problem`\n",
+    "- Check the size of the `DataFrame`\n",
+    "- Calculate the expected new size\n",
+    "- Perform the transformation to have one row per problem\n",
+    "- Check that the new `DataFrame` has the expected size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
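One way this step could look (a sketch; `DataFrame.explode` is available from pandas 0.25):

```python
# Expected size: roughly the total number of problems. Docstrings whose
# 'problems' list is empty still keep one row (with NaN) after exploding.
print(len(df), df['problems'].str.len().sum())

# One row per element of each list in the 'problems' column.
df = df.explode('problems')
print(len(df))
```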
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Extract problem information\n",
+    "\n",
+    "- Get the problem information of the first row in the `DataFrame`\n",
+    "- How can we get the values for the `code` and the `message` independently\n",
+    "- Implement it for the whole column at the same time\n",
+    "- Discuss if there are other ways to extract them"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
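A minimal sketch of the extraction, assuming each problem is a `[code, message]` pair as produced by the validation script:

```python
# A single row: the pair can be unpacked positionally.
print(df['problems'].iloc[0])

# For the whole column at once, .str[] indexes into each list element-wise
# (rows without problems stay as NaN).
df['code'] = df['problems'].str[0]
df['message'] = df['problems'].str[1]
```

The solution script instead expands the pairs with `apply(pandas.Series)` and joins the result back; comparing the two approaches is part of the discussion.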
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save data as categories\n",
+    "\n",
+    "- Check the number of unique values in every column\n",
+    "- Discuss what are the advantages of using categories\n",
+    "- Check which is the memory usage of the `DataFrame`\n",
+    "- Convert to categories the columns that make sense\n",
+    "- Check again the memory usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
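A sketch of the categories step (the solution converts `section`, `type` and `code`; other columns may also qualify):

```python
# How many distinct values per column, and how much memory is used now?
print(df.nunique())
print(df.memory_usage(deep=True).sum())

# Columns with few distinct, repeated values benefit most from the category dtype.
for col in ['section', 'type', 'code']:
    df[col] = df[col].astype('category')

print(df.memory_usage(deep=True).sum())
```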
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save data to disk\n",
+    "\n",
+    "- Save data into `data/docstring_errors_pandas023.hd5`\n",
+    "- Discuss what is the effect of the parameter `key` and try more than one value\n",
+    "- Load the data again from the format\n",
+    "- Check whether the data is still the same after reloading it, what is the cause if not, and how to fix it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
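A hedged sketch of the round trip (writing HDF5 requires the PyTables package; `key` names the node inside the file, so several tables can share one file):

```python
fname = os.path.join('data', 'docstring_errors_pandas023.hd5')
df.to_hdf(fname, key='main', mode='w', format='table')

# Reload and compare; differences in dtypes (e.g. categories) are a common
# cause of the comparison failing and are worth discussing.
reloaded = pandas.read_hdf(fname, key='main')
print(df.equals(reloaded))
```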
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Solution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load solutions/pandas_docstrings.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

+ 0 - 53
03_Page_views_wrangling.ipynb

@@ -1,53 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import pandas\n",
-    "\n",
-    "DATA_DIR = os.path.join('data', 'pandas_website')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load solutions/page_views_1.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

+ 0 - 46
04_Page_views_eda.ipynb

@@ -1,46 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import pandas\n",
-    "\n",
-    "DATA_DIR = os.path.join('data', 'pandas_website')\n",
-    "\n",
-    "df = pandas.read_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

+ 145 - 0
04_Page_views_wrangling.ipynb

@@ -0,0 +1,145 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tutorial: pandas website views\n",
+    "\n",
+    "A measure to know how relevant is a class or function to pandas users,\n",
+    "is the number of visits to its page in the documentation.\n",
+    "\n",
+    "We use Google analytics to track visits to the pandas website.\n",
+    "This is the dashboard for page views per visits, in 2018:\n",
+    "\n",
+    "![](img/pandas_website_views.png)\n",
+    "\n",
+    "While Google analytics has an API, it doesn't make it easy for users\n",
+    "to download the data. So, we downloaded all the visits from that page\n",
+    "with the `Export` option, which downloads the information visible in the page.\n",
+    "We did that 20 times, for the first 20 pages of results, and saved the data\n",
+    "in the `data/pandas_website` directory.\n",
+    "\n",
+    "In this tutorial we will load data from csv files, we will concatenate them,\n",
+    "and we will transform the data into a format useful to analyze."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas\n",
+    "\n",
+    "DATA_DIR = os.path.join('data', 'pandas_website')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load first csv file into a pandas DataFrame\n",
+    "\n",
+    "- Load data from the first csv into a DataFrame\n",
+    "- Explore the data, size, data types of columns, how the values look like, if there are missing values,..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
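A hypothetical sketch of loading and exploring the first export (the exact file names and `read_csv` arguments depend on how Google Analytics formatted the exports, so they may need adjusting after inspecting the raw files):

```python
import glob

first_fname = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))[0]
df = pandas.read_csv(first_fname)

print(df.shape)         # size
print(df.dtypes)        # column types (numbers often arrive as strings)
print(df.isna().sum())  # missing values per column
df.head()
```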
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert data into the right format with the right types\n",
+    "\n",
+    "- Drop the `Page Value` column, since it doesn't contain useful information\n",
+    "- Set the `Page` as the index, so we can access rows by the page\n",
+    "- Convert every column to its numerical type, so we can operate with them"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
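A sketch of the conversion, following the transformations visible in `solutions/page_views_1.py` (numbers come as formatted strings such as `'1,234'` and `'56.78%'`, hence the `locale` parsing; any column name not shown in that script is an assumption):

```python
import locale

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

df = (df.drop(columns=['Page Value'])   # no useful information
        .set_index('Page'))             # access rows by page

# Parse thousands separators and percentage signs into numbers.
df['Entrances'] = df['Entrances'].apply(locale.atoi)
df['Bounce Rate'] = df['Bounce Rate'].str.rstrip('%').astype(float)
df['% Exit'] = df['% Exit'].str.rstrip('%').astype(float)
```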
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Repeat for all data\n",
+    "\n",
+    "- Repeat the same for all available files, and get a single `DataFrame` with all the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
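One possible way to put it together (a sketch with a hypothetical `clean()` helper wrapping the steps of the previous section; it assumes the `locale` set-up from that section has already run):

```python
import glob

def clean(df):
    df = df.drop(columns=['Page Value']).set_index('Page')
    df['Entrances'] = df['Entrances'].apply(locale.atoi)
    df['Bounce Rate'] = df['Bounce Rate'].str.rstrip('%').astype(float)
    df['% Exit'] = df['% Exit'].str.rstrip('%').astype(float)
    return df

df = pandas.concat(clean(pandas.read_csv(fname))
                   for fname in sorted(glob.glob(os.path.join(DATA_DIR, '*.csv'))))
```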
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save data\n",
+    "\n",
+    "- Save the final `DataFrame` into the file `pandas_page_views_2018.parquet`\n",
+    "- Find information about the parquet format, and discuss what are the advantages compared to other formats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
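A minimal sketch of the save step, matching the call in `solutions/page_views_1.py` (parquet is a columnar binary format that preserves dtypes and compresses well, but it needs `pyarrow` or `fastparquet` installed):

```python
df.to_parquet(os.path.join('data', 'pandas_page_views_2018.parquet'),
              engine='pyarrow')
```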
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Solution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load solutions/page_views_wrangling.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

data/pandas_website/pandas_website_views_2018.parquet → data/pandas_page_views_2018.parquet


BIN
img/pandas_website_views.png


+ 3 - 1
solutions/page_views_1.py

@@ -1,5 +1,7 @@
 import locale
 import glob
+import os
+import pandas
 
 
 locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
@@ -19,5 +21,5 @@ locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
                   'Entrances': lambda df: df['Entrances'].apply(locale.atoi),
                   'Bounce Rate': lambda df: df['Bounce Rate'].str.rstrip('%').astype(float),
                   '% Exit': lambda df: df['% Exit'].str.rstrip('%').astype(float)})
-       .to_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'),
+       .to_parquet(os.path.join('data', 'pandas_page_views_2018.parquet'),
                    engine='pyarrow'))

+ 20 - 0
solutions/pandas_docstrings.py

@@ -0,0 +1,20 @@
+import os
+import pandas
+
+
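+# Build a tidy table with one row per (docstring, problem): drop deprecated
+# objects, merge errors and warnings, explode the resulting lists, split each
+# problem into its code and message, and save everything as an HDF5 table.
+# DATA_FNAME is defined in the notebook cell that %load's this script.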
+(pandas.read_json(DATA_FNAME, orient='index')
+       .query('not deprecated')
+       .assign(problems=lambda df: df.errors + df.warnings,
+               section=lambda df: df['section'].astype('category'),
+               type=lambda df: df['type'].astype('category'),
+               docstring_length=lambda df: df['docstring'].str.len())
+       .explode('problems')
+       .pipe(lambda df: df.join(df['problems'].apply(pandas.Series)
+                                              .rename(columns={0: 'code',
+                                                               1: 'message'})))
+       .assign(code=lambda df: df['code'].astype('category'))
+       .loc[:, ['docstring_length', 'section', 'type', 'code', 'message']]
+       .to_hdf(os.path.join('data', 'docstring_errors_pandas023.hd5'),
+               key='main',
+               mode='w',
+               format='table'))