4 лет назад · 9a40a78470
--- a/ai/Megatron/English/Python/jupyter_notebook/Lab2-4_customize_process2mmap.ipynb
+++ b/ai/Megatron/English/Python/jupyter_notebook/Lab2-4_customize_process2mmap.ipynb
@@ -2,7 +2,7 @@
 
				  "cells": [
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "encouraging-melissa",
			
 
				+   "id": "fixed-species",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "## Customize preprocess_data.py\n",
			
@@ -27,7 +27,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "regulation-medium",
			
 
				+   "id": "comparative-render",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "1. Convert the extracted raw Swedish text from webnyheter2013.txt to webnyheter2013.json."
			
@@ -36,7 +36,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "alone-asian",
			
 
				+   "id": "alien-spanking",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -45,7 +45,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "corporate-honor",
			
 
				+   "id": "quiet-innocent",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "Below are the expected outputs:\n",
			
@@ -58,7 +58,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "diagnostic-mercury",
			
 
				+   "id": "relative-execution",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "2. Generate the mmap format files with the default preprocess_data.py as the first step, to ensure we have the data necessary for the next notebook to run in case time runs out."
			
@@ -67,7 +67,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "unique-assistant",
			
 
				+   "id": "known-illness",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -81,7 +81,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "affecting-brave",
			
 
				+   "id": "least-platform",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -99,7 +99,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "mathematical-crawford",
			
 
				+   "id": "lined-literacy",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "Below are the expected outputs:\n",
			
@@ -116,7 +116,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "involved-geneva",
			
 
				+   "id": "periodic-treaty",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "Now we have the default mmap files (xxx.bin and xxx.idx), which guarantees we have the data needed for the next notebook to run, regardless of whether we finish the mini-challenge or not. \n",
			
@@ -131,7 +131,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "understood-poultry",
			
 
				+   "id": "norman-accreditation",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -140,7 +140,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "north-pension",
			
 
				+   "id": "maritime-bunny",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "<a id=\"Custom-Sentence-Splitter\"></a>"
			
@@ -148,7 +148,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "textile-australian",
			
 
				+   "id": "foreign-advocacy",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "The custom sentence-splitter `cut_sentence_with_quotation_marks` function is provided below for your convenience; please integrate this custom function into `MYpreprocess_data.py`."
			
@@ -157,7 +157,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "swedish-column",
			
 
				+   "id": "celtic-latter",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -192,7 +192,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "heated-drunk",
			
 
				+   "id": "bacterial-consequence",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "<a id=\"Mini-Challenge\"></a>"
			
@@ -200,7 +200,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "normal-rachel",
			
 
				+   "id": "separated-occupation",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "---\n",
			
@@ -222,7 +222,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "adopted-yeast",
			
 
				+   "id": "unknown-seven",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -435,7 +435,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "accessory-banana",
			
 
				+   "id": "ruled-service",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "The cell block below specifies all the input parameters needed to run `MYpreprocess_data.py`. \n",
			
@@ -446,20 +446,20 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "overhead-hydrogen",
			
 
				+   "id": "simplified-antarctica",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "INPUT_JSON_FILE='../dataset/SV/webnyheter2013.json'\n",
			
 
				     "OUTPUT_PATH='../dataset/SV/customSentenceSplit'\n",
			
 
				-    "VOCAB_FILE='../dataset/SV/32k/vocab.json'\n",
			
 
				-    "MERGE_FILE='../dataset/SV/32k/merges.txt'\n",
			
 
				+    "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
			
 
				+    "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
			
 
				     "NUM_CPUS=16"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "interior-healthcare",
			
 
				+   "id": "understanding-things",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "The code block below is a ReRun cell that launches `MYpreprocess_data.py` and, if the script runs successfully, produces the customSentenceSplit_text_document.bin and customSentenceSplit_text_document.idx files.\n",
			
@@ -472,7 +472,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "compact-access",
			
 
				+   "id": "exclusive-region",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -490,7 +490,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "automatic-gravity",
			
 
				+   "id": "armed-german",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "Check whether these two files, `customSentenceSplit_text_document.bin` and `customSentenceSplit_text_document.idx`, were successfully generated and are in the correct folder under dataset."
			
@@ -499,7 +499,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "substantial-spare",
			
 
				+   "id": "fantastic-harmony",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -509,7 +509,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "civic-airplane",
			
 
				+   "id": "final-stomach",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -519,7 +519,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "velvet-tennessee",
			
 
				+   "id": "still-movement",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "-----\n",
			
@@ -528,7 +528,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "rising-ready",
			
 
				+   "id": "organized-mother",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "-----\n",