4 år sedan · 9a40a78470
--- a/ai/Megatron/English/Python/jupyter_notebook/Lab2-4_customize_process2mmap.ipynb
+++ b/ai/Megatron/English/Python/jupyter_notebook/Lab2-4_customize_process2mmap.ipynb
@@ -2,7 +2,7 @@
 
																  "cells": [
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "encouraging-melissa",
															
 
																+   "id": "fixed-species",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "## Customize preprocess_data.py\n",
															
@@ -27,7 +27,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "regulation-medium",
															
 
																+   "id": "comparative-render",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "1. Convert the extracted raw Swedish text from webnyheter2013.txt to webnyheter2013.json."
															
@@ -36,7 +36,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "alone-asian",
															
 
																+   "id": "alien-spanking",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -45,7 +45,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "corporate-honor",
															
 
																+   "id": "quiet-innocent",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "Below are the expected outputs:\n",
															
@@ -58,7 +58,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "diagnostic-mercury",
															
 
																+   "id": "relative-execution",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "2. Generate the mmap format files with the default preprocess_data.py as a first step, to ensure we have the data necessary for the next notebook to run in case time runs out."
															
@@ -67,7 +67,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "unique-assistant",
															
 
																+   "id": "known-illness",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -81,7 +81,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "affecting-brave",
															
 
																+   "id": "least-platform",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -99,7 +99,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "mathematical-crawford",
															
 
																+   "id": "lined-literacy",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "Below are the expected outputs:\n",
															
@@ -116,7 +116,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "involved-geneva",
															
 
																+   "id": "periodic-treaty",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "Now we have the default mmap files (xxx.bin and xxx.idx), which guarantees we have the data needed for the next notebook to run, regardless of whether we finish the mini-challenge. \n",
															
@@ -131,7 +131,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "understood-poultry",
															
 
																+   "id": "norman-accreditation",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -140,7 +140,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "north-pension",
															
 
																+   "id": "maritime-bunny",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "<a id=\"Custom-Sentence-Splitter\"></a>"
															
@@ -148,7 +148,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "textile-australian",
															
 
																+   "id": "foreign-advocacy",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "The custom sentence-splitter `cut_sentence_with_quotation_marks` function is provided below for your convenience; please integrate this custom function into `MYpreprocess_data.py`."
															
@@ -157,7 +157,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "swedish-column",
															
 
																+   "id": "celtic-latter",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -192,7 +192,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "heated-drunk",
															
 
																+   "id": "bacterial-consequence",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "<a id=\"Mini-Challenge\"></a>"
															
@@ -200,7 +200,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "normal-rachel",
															
 
																+   "id": "separated-occupation",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "---\n",
															
@@ -222,7 +222,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "adopted-yeast",
															
 
																+   "id": "unknown-seven",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -435,7 +435,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "accessory-banana",
															
 
																+   "id": "ruled-service",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "The cell block below specifies all the input parameters needed to run `MYpreprocess_data.py`. \n",
															
@@ -446,20 +446,20 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "overhead-hydrogen",
															
 
																+   "id": "simplified-antarctica",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
 
																     "INPUT_JSON_FILE='../dataset/SV/webnyheter2013.json'\n",
															
 
																     "OUTPUT_PATH='../dataset/SV/customSentenceSplit'\n",
															
 
																-    "VOCAB_FILE='../dataset/SV/32k/vocab.json'\n",
															
 
																-    "MERGE_FILE='../dataset/SV/32k/merges.txt'\n",
															
 
																+    "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
															
 
																+    "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
															
 
																     "NUM_CPUS=16"
															
 
																    ]
															
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "interior-healthcare",
															
 
																+   "id": "understanding-things",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "The code block below is a ReRun cell that launches `MYpreprocess_data.py` and, if the script runs successfully, produces the customSentenceSplit_text_document.bin and customSentenceSplit_text_document.idx files.\n",
															
@@ -472,7 +472,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "compact-access",
															
 
																+   "id": "exclusive-region",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -490,7 +490,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "automatic-gravity",
															
 
																+   "id": "armed-german",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "Check whether the two files `customSentenceSplit_text_document.bin` and `customSentenceSplit_text_document.idx` were successfully generated and are in the correct folder under dataset."
															
@@ -499,7 +499,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "substantial-spare",
															
 
																+   "id": "fantastic-harmony",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -509,7 +509,7 @@
 
																   {
															
 
																    "cell_type": "code",
															
 
																    "execution_count": null,
															
 
																-   "id": "civic-airplane",
															
 
																+   "id": "final-stomach",
															
 
																    "metadata": {},
															
 
																    "outputs": [],
															
 
																    "source": [
															
@@ -519,7 +519,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "velvet-tennessee",
															
 
																+   "id": "still-movement",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "-----\n",
															
@@ -528,7 +528,7 @@
 
																   },
															
 
																   {
															
 
																    "cell_type": "markdown",
															
 
																-   "id": "rising-ready",
															
 
																+   "id": "organized-mother",
															
 
																    "metadata": {},
															
 
																    "source": [
															
 
																     "-----\n",