6 лет назад · 9b8d62db52
--- a/03_Docstring_errors_wrangling.ipynb
+++ b/03_Docstring_errors_wrangling.ipynb
@@ -0,0 +1,252 @@
 
																+{
															
 
																+ "cells": [
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "# Tutorial: Errors in the pandas API reference\n",
															
 
																+    "\n",
															
 
																+    "In Python, documentation of objects is defined in the objects themselves. For example:\n",
															
 
																+    "\n",
															
 
																+    "```python\n",
															
 
																+    "def divide(dividend, divisor):\n",
															
 
																+    "    \"\"\"\n",
															
 
																+    "    Compute the division of two floating point numbers.\n",
															
 
																+    "    \n",
															
 
																+    "    Parameters\n",
															
 
																+    "    ----------\n",
															
 
																+    "    dividend : float\n",
															
 
																+    "        Number to divide.\n",
															
 
																+    "    divisor : float\n",
															
 
																+    "        Number to divide by.\n",
															
 
																+    "    \n",
															
 
																+    "    Returns\n",
															
 
																+    "    -------\n",
															
 
																+    "    float\n",
															
 
																+    "        The result of the division.\n",
															
 
																+    "    \"\"\"\n",
															
 
																+    "    return dividend / divisor\n",
															
 
																+    "```\n",
															
 
																+    "\n",
															
 
																+    "There are tools to extract this documentation (named docstrings), and generate the\n",
															
 
																+    "web version of it.\n",
															
 
																+    "\n",
															
 
																+    "To make sure the documentation is formatted correctly on the web, and to keep consistency among\n",
															
 
																+    "the pages, there are some standards that we aim to follow. For historical reasons, many\n",
															
 
																+    "docstrings don't follow these standards.\n",
															
 
																+    "\n",
															
 
																+    "Some of the errors are listed below (each one is identified by a code):\n",
															
 
																+    "\n",
															
 
																+    "- **SS02**: Summary does not start with a capital letter\n",
															
 
																+    "- **SS03**: Summary does not end with a period\n",
															
 
																+    "- **PR01**: Parameters {missing_params} not documented\n",
															
 
																+    "- **RT01**: No Returns section found\n",
															
 
																+    "\n",
															
 
																+    "The following docstring would trigger all of them:\n",
															
 
																+    "\n",
															
 
																+    "```python\n",
															
 
																+    "def divide(dividend, divisor):\n",
															
 
																+    "    \"\"\"\n",
															
 
																+    "    compute the division of two floating point numbers\n",
															
 
																+    "    \n",
															
 
																+    "    Parameters\n",
															
 
																+    "    ----------\n",
															
 
																+    "    dividend : float\n",
															
 
																+    "        Number to divide.\n",
															
 
																+    "    \"\"\"\n",
															
 
																+    "    return dividend / divisor\n",
															
 
																+    "```\n",
															
 
																+    "\n",
															
 
																+    "We developed a script that is able to automatically detect these errors and report\n",
															
 
																+    "them. It can return all the errors in the whole pandas code base in json format with\n",
															
 
																+    "the following command:\n",
															
 
																+    "\n",
															
 
																+    "```\n",
															
 
																+    "./scripts/validate_docstrings.py --format=json | gzip > docstring_errors_pandas023.json.gz\n",
															
 
																+    "```\n",
															
 
																+    "\n",
															
 
																+    "In this tutorial we will load the output of this script, and we will transform it\n",
															
 
																+    "to keep the relevant information"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "import os\n",
															
 
																+    "import pandas\n",
															
 
																+    "\n",
															
 
																+    "DATA_FNAME = os.path.join('data', 'docstring_errors_pandas023.json.gz')"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Load data\n",
															
 
																+    "\n",
															
 
																+    "- Load data from the json file `DATA_FNAME`\n",
															
 
																+    "- Try reading the data with different `orient` values, and read it so every row is a docstring"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### New columns\n",
															
 
																+    "\n",
															
 
																+    "- Create a column `docstring_length` with the number of characters of the docstring\n",
															
 
																+    "- Create a column `problems` with the errors and warnings combined in a single list"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Delete information not needed\n",
															
 
																+    "\n",
															
 
																+    "- Remove docstrings of functions being deprecated\n",
															
 
																+    "- Remove columns `errors` and `warnings`\n",
															
 
																+    "- Remove the `docstring`"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Create a row per problem\n",
															
 
																+    "\n",
															
 
																+    "- Discuss possible ways of creating a row for each problem in the lists of the column `problems`\n",
															
 
																+    "- Check the size of the `DataFrame`\n",
															
 
																+    "- Calculate the expected new size\n",
															
 
																+    "- Perform the transformation to have one row per problem\n",
															
 
																+    "- Check that the new `DataFrame` has the expected size"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Extract problem information\n",
															
 
																+    "\n",
															
 
																+    "- Get the problem information of the first row in the `DataFrame`\n",
															
 
																+    "- How can we get the values for the `code` and the `message` independently?\n",
															
 
																+    "- Implement it for the whole column at the same time\n",
															
 
																+    "- Discuss if there are other ways to extract them"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Save data as categories\n",
															
 
																+    "\n",
															
 
																+    "- Check the number of unique values in every column\n",
															
 
																+    "- Discuss the advantages of using categories\n",
															
 
																+    "- Check the memory usage of the `DataFrame`\n",
															
 
																+    "- Convert to categories the columns that make sense\n",
															
 
																+    "- Check again the memory usage"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Save data to disk\n",
															
 
																+    "\n",
															
 
																+    "- Save data into `data/docstring_errors_pandas023.hd5`\n",
															
 
																+    "- Discuss the effect of the parameter `key` and try more than one value\n",
															
 
																+    "- Load the data again from the format\n",
															
 
																+    "- Check whether the data is still the same after reloading it, what is the cause if not, and how to fix it"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Solution"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "%load solutions/pandas_docstrings.py"
															
 
																+   ]
															
 
																+  }
															
 
																+ ],
															
 
																+ "metadata": {
															
 
																+  "kernelspec": {
															
 
																+   "display_name": "Python 3",
															
 
																+   "language": "python",
															
 
																+   "name": "python3"
															
 
																+  },
															
 
																+  "language_info": {
															
 
																+   "codemirror_mode": {
															
 
																+    "name": "ipython",
															
 
																+    "version": 3
															
 
																+   },
															
 
																+   "file_extension": ".py",
															
 
																+   "mimetype": "text/x-python",
															
 
																+   "name": "python",
															
 
																+   "nbconvert_exporter": "python",
															
 
																+   "pygments_lexer": "ipython3",
															
 
																+   "version": "3.7.3"
															
 
																+  }
															
 
																+ },
															
 
																+ "nbformat": 4,
															
 
																+ "nbformat_minor": 2
															
 
																+}
															
--- a/03_Page_views_wrangling.ipynb
+++ b/03_Page_views_wrangling.ipynb
@@ -1,53 +0,0 @@
 
																-{
															
 
																- "cells": [
															
 
																-  {
															
 
																-   "cell_type": "code",
															
 
																-   "execution_count": null,
															
 
																-   "metadata": {},
															
 
																-   "outputs": [],
															
 
																-   "source": [
															
 
																-    "import os\n",
															
 
																-    "import pandas\n",
															
 
																-    "\n",
															
 
																-    "DATA_DIR = os.path.join('data', 'pandas_website')"
															
 
																-   ]
															
 
																-  },
															
 
																-  {
															
 
																-   "cell_type": "code",
															
 
																-   "execution_count": null,
															
 
																-   "metadata": {},
															
 
																-   "outputs": [],
															
 
																-   "source": [
															
 
																-    "%load solutions/page_views_1.py"
															
 
																-   ]
															
 
																-  },
															
 
																-  {
															
 
																-   "cell_type": "code",
															
 
																-   "execution_count": null,
															
 
																-   "metadata": {},
															
 
																-   "outputs": [],
															
 
																-   "source": []
															
 
																-  }
															
 
																- ],
															
 
																- "metadata": {
															
 
																-  "kernelspec": {
															
 
																-   "display_name": "Python 3",
															
 
																-   "language": "python",
															
 
																-   "name": "python3"
															
 
																-  },
															
 
																-  "language_info": {
															
 
																-   "codemirror_mode": {
															
 
																-    "name": "ipython",
															
 
																-    "version": 3
															
 
																-   },
															
 
																-   "file_extension": ".py",
															
 
																-   "mimetype": "text/x-python",
															
 
																-   "name": "python",
															
 
																-   "nbconvert_exporter": "python",
															
 
																-   "pygments_lexer": "ipython3",
															
 
																-   "version": "3.7.3"
															
 
																-  }
															
 
																- },
															
 
																- "nbformat": 4,
															
 
																- "nbformat_minor": 2
															
 
																-}
															
--- a/04_Page_views_eda.ipynb
+++ b/04_Page_views_eda.ipynb
@@ -1,46 +0,0 @@
 
																-{
															
 
																- "cells": [
															
 
																-  {
															
 
																-   "cell_type": "code",
															
 
																-   "execution_count": null,
															
 
																-   "metadata": {},
															
 
																-   "outputs": [],
															
 
																-   "source": [
															
 
																-    "import os\n",
															
 
																-    "import pandas\n",
															
 
																-    "\n",
															
 
																-    "DATA_DIR = os.path.join('data', 'pandas_website')\n",
															
 
																-    "\n",
															
 
																-    "df = pandas.read_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'))"
															
 
																-   ]
															
 
																-  },
															
 
																-  {
															
 
																-   "cell_type": "code",
															
 
																-   "execution_count": null,
															
 
																-   "metadata": {},
															
 
																-   "outputs": [],
															
 
																-   "source": []
															
 
																-  }
															
 
																- ],
															
 
																- "metadata": {
															
 
																-  "kernelspec": {
															
 
																-   "display_name": "Python 3",
															
 
																-   "language": "python",
															
 
																-   "name": "python3"
															
 
																-  },
															
 
																-  "language_info": {
															
 
																-   "codemirror_mode": {
															
 
																-    "name": "ipython",
															
 
																-    "version": 3
															
 
																-   },
															
 
																-   "file_extension": ".py",
															
 
																-   "mimetype": "text/x-python",
															
 
																-   "name": "python",
															
 
																-   "nbconvert_exporter": "python",
															
 
																-   "pygments_lexer": "ipython3",
															
 
																-   "version": "3.7.3"
															
 
																-  }
															
 
																- },
															
 
																- "nbformat": 4,
															
 
																- "nbformat_minor": 2
															
 
																-}
															
--- a/04_Page_views_wrangling.ipynb
+++ b/04_Page_views_wrangling.ipynb
@@ -0,0 +1,145 @@
 
																+{
															
 
																+ "cells": [
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "# Tutorial: pandas website views\n",
															
 
																+    "\n",
															
 
																+    "One measure of how relevant a class or function is to pandas users\n",
															
 
																+    "is the number of visits to its page in the documentation.\n",
															
 
																+    "\n",
															
 
																+    "We use Google Analytics to track visits to the pandas website.\n",
															
 
																+    "This is the dashboard for page views per visits, in 2018:\n",
															
 
																+    "\n",
															
 
																+    "![](img/pandas_website_views.png)\n",
															
 
																+    "\n",
															
 
																+    "While Google Analytics has an API, it doesn't make it easy for users\n",
															
 
																+    "to download the data. So, we downloaded all the visits from that page\n",
															
 
																+    "with the `Export` option, which downloads the information visible in the page.\n",
															
 
																+    "We did that 20 times, for the first 20 pages of results, and saved the data\n",
															
 
																+    "in the `data/pandas_website` directory.\n",
															
 
																+    "\n",
															
 
																+    "In this tutorial we will load data from csv files, we will concatenate them,\n",
															
 
																+    "and we will transform the data into a format useful to analyze."
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "import os\n",
															
 
																+    "import pandas\n",
															
 
																+    "\n",
															
 
																+    "DATA_DIR = os.path.join('data', 'pandas_website')"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Load first csv file into a pandas DataFrame\n",
															
 
																+    "\n",
															
 
																+    "- Load data from the first csv into a DataFrame\n",
															
 
																+    "- Explore the data: its size, the data types of the columns, what the values look like, whether there are missing values,..."
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Convert data into the right format with the right types\n",
															
 
																+    "\n",
															
 
																+    "- Drop the `Page Value` column, since it doesn't contain useful information\n",
															
 
																+    "- Set the `Page` as the index, so we can access rows by the page\n",
															
 
																+    "- Convert every column to its numerical type, so we can operate with them"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Repeat for all data\n",
															
 
																+    "\n",
															
 
																+    "- Repeat the same for all available files, and get a single `DataFrame` with all the data"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Save data\n",
															
 
																+    "\n",
															
 
																+    "- Save the final `DataFrame` into the file `pandas_page_views_2018.parquet`\n",
															
 
																+    "- Find information about the parquet format, and discuss its advantages compared to other formats"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": []
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "markdown",
															
 
																+   "metadata": {},
															
 
																+   "source": [
															
 
																+    "### Solution"
															
 
																+   ]
															
 
																+  },
															
 
																+  {
															
 
																+   "cell_type": "code",
															
 
																+   "execution_count": null,
															
 
																+   "metadata": {},
															
 
																+   "outputs": [],
															
 
																+   "source": [
															
 
																+    "%load solutions/page_views_wrangling.py"
															
 
																+   ]
															
 
																+  }
															
 
																+ ],
															
 
																+ "metadata": {
															
 
																+  "kernelspec": {
															
 
																+   "display_name": "Python 3",
															
 
																+   "language": "python",
															
 
																+   "name": "python3"
															
 
																+  },
															
 
																+  "language_info": {
															
 
																+   "codemirror_mode": {
															
 
																+    "name": "ipython",
															
 
																+    "version": 3
															
 
																+   },
															
 
																+   "file_extension": ".py",
															
 
																+   "mimetype": "text/x-python",
															
 
																+   "name": "python",
															
 
																+   "nbconvert_exporter": "python",
															
 
																+   "pygments_lexer": "ipython3",
															
 
																+   "version": "3.7.3"
															
 
																+  }
															
 
																+ },
															
 
																+ "nbformat": 4,
															
 
																+ "nbformat_minor": 2
															
 
																+}
															
--- a/data/pandas_website/pandas_website_views_2018.parquet
+++ b/data/pandas_website/pandas_website_views_2018.parquet
--- a/img/pandas_website_views.png
+++ b/img/pandas_website_views.png
--- a/solutions/page_views_1.py
+++ b/solutions/page_views_1.py
@@ -1,5 +1,7 @@
 
																 import locale
															
 
																 import glob
															
 
																+import os
															
 
																+import pandas
															
 
																 locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
															
@@ -19,5 +21,5 @@ locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
 
																                   'Entrances': lambda df: df['Entrances'].apply(locale.atoi),
															
 
																                   'Bounce Rate': lambda df: df['Bounce Rate'].str.rstrip('%').astype(float),
															
 
																                   '% Exit': lambda df: df['% Exit'].str.rstrip('%').astype(float)})
															
 
																-       .to_parquet(os.path.join(DATA_DIR, 'pandas_website_views_2018.parquet'),
															
 
																+       .to_parquet(os.path.join('data', 'pandas_page_views_2018.parquet'),
															
 
																                    engine='pyarrow'))
															
--- a/solutions/pandas_docstrings.py
+++ b/solutions/pandas_docstrings.py
@@ -0,0 +1,20 @@
 
																+import os
															
 
																+import pandas
															
 
																+
															
 
																+
															
 
																+(pandas.read_json(DATA_FNAME, orient='index')
															
 
																+       .query('not deprecated')
															
 
																+       .assign(problems=lambda df: df.errors + df.warnings,
															
 
																+               section=lambda df: df['section'].astype('category'),
															
 
																+               type=lambda df: df['type'].astype('category'),
															
 
																+               docstring_length=lambda df: df['docstring'].str.len())
															
 
																+       .explode('problems')
															
 
																+       .pipe(lambda df: df.join(df['problems'].apply(pandas.Series)
															
 
																+                                              .rename(columns={0: 'code',
															
 
																+                                                               1: 'message'})))
															
 
																+       .assign(code=lambda df: df['code'].astype('category'))
															
 
																+       .loc[:, ['docstring_length', 'section', 'type', 'code', 'message']]
															
 
																+       .to_hdf(os.path.join('data', 'docstring_errors_pandas023.hd5'),
															
 
																+               key='main',
															
 
																+               mode='w',
															
 
																+               format='table'))