
correcting paths sets and comments on how to modify

zenodia 3 years ago
Parent
Commit
a21e44d2d1
22 changed files with 14313 additions and 2310 deletions
  1. + 44 - 49  ai/Megatron/English/Python/Start_Here.ipynb
  2. + 165 - 26  ai/Megatron/English/Python/jupyter_notebook/Day2-3_GPT_vocab_merge_files.ipynb
  3. + 60 - 29  ai/Megatron/English/Python/jupyter_notebook/Day2-4_jsonfy_and_process2mmap.ipynb
  4. + 341 - 265  ai/Megatron/English/Python/jupyter_notebook/Day2-5_Observe_GPT_runs_vs_performance.ipynb
  5. + 322 - 559  ai/Megatron/English/Python/jupyter_notebook/Day3-3_train_own_GPT2BPETokenizer.ipynb
  6. + 12622 - 20  ai/Megatron/English/Python/jupyter_notebook/Day3-4_customize_process2mmap.ipynb
  7. + 155 - 114  ai/Megatron/English/Python/jupyter_notebook/Day3-5_run_Megatron_with_varying_config.ipynb
  8. + 0 - 126  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/Dlprof_pretrain_gpt.py
  9. + 0 - 49  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/nsys_test.sh
  10. + 5 - 5  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_2nd_run.sh
  11. + 11 - 11  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_SVGPT_BIG.sh
  12. + 7 - 7  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_naive_run.sh
  13. + 0 - 76  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/resume_iter1M_SVpretrainGPT3_2.7B.sh
  14. + 1 - 0  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/sv_utils/trainGPTTokenizer.py
  15. + 0 - 670  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-(option)Website_scrapping.ipynb
  16. + 31 - 40  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-1_acquiring_data.ipynb
  17. + 312 - 150  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-2_SentenceBoundary_and_Deduplicate.ipynb
  18. + 221 - 102  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb
  19. + 1 - 0  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/fetchURLs_and_write2html.sh
  20. + 6 - 6  ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/verify_GPT3_Svenska.sh
  21. + 5 - 5  ai/Megatron/English/Python/jupyter_notebook/params_cnt.sh
  22. + 4 - 1  ai/Megatron/English/Python/source_code/create_dir_and_download_pytorch_sif_file.sh

+ 44 - 49
ai/Megatron/English/Python/Start_Here.ipynb

@@ -60,29 +60,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Wed Aug 25 07:03:55 2021       \n",
+      "Wed Sep 15 09:14:15 2021       \n",
       "+-----------------------------------------------------------------------------+\n",
-      "| NVIDIA-SMI 450.51.05    Driver Version: 450.51.05    CUDA Version: 11.2     |\n",
+      "| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |\n",
       "|-------------------------------+----------------------+----------------------+\n",
       "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
       "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
       "|                               |                      |               MIG M. |\n",
       "|===============================+======================+======================|\n",
-      "|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |\n",
-      "| N/A   34C    P0    57W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
-      "|                               |                      |                  N/A |\n",
+      "|   0  A100-SXM4-40GB      On   | 00000000:07:00.0 Off |                    0 |\n",
+      "| N/A   24C    P0    57W / 400W |      0MiB / 40536MiB |      4%      Default |\n",
+      "|                               |                      |             Disabled |\n",
       "+-------------------------------+----------------------+----------------------+\n",
-      "|   1  Tesla V100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |\n",
-      "| N/A   30C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
-      "|                               |                      |                  N/A |\n",
-      "+-------------------------------+----------------------+----------------------+\n",
-      "|   2  Tesla V100-SXM2...  On   | 00000000:86:00.0 Off |                    0 |\n",
-      "| N/A   31C    P0    41W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
-      "|                               |                      |                  N/A |\n",
-      "+-------------------------------+----------------------+----------------------+\n",
-      "|   3  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |\n",
-      "| N/A   33C    P0    40W / 300W |      0MiB / 16160MiB |      0%      Default |\n",
-      "|                               |                      |                  N/A |\n",
+      "|   1  A100-SXM4-40GB      On   | 00000000:0F:00.0 Off |                    0 |\n",
+      "| N/A   24C    P0    53W / 400W |      0MiB / 40536MiB |      0%      Default |\n",
+      "|                               |                      |             Disabled |\n",
       "+-------------------------------+----------------------+----------------------+\n",
       "                                                                               \n",
       "+-----------------------------------------------------------------------------+\n",
@@ -132,34 +124,32 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-b29deceb-3745-51d2-2cf3-807ea8ac8e60)\n",
-      "\t Link 0: 25.781 GB/s\n",
-      "\t Link 1: 25.781 GB/s\n",
-      "\t Link 2: 25.781 GB/s\n",
-      "\t Link 3: 25.781 GB/s\n",
-      "\t Link 4: 25.781 GB/s\n",
-      "\t Link 5: 25.781 GB/s\n",
-      "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-4de46420-3e95-182f-c0c3-d488dda562d8)\n",
-      "\t Link 0: 25.781 GB/s\n",
-      "\t Link 1: 25.781 GB/s\n",
-      "\t Link 2: 25.781 GB/s\n",
-      "\t Link 3: 25.781 GB/s\n",
-      "\t Link 4: 25.781 GB/s\n",
-      "\t Link 5: 25.781 GB/s\n",
-      "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-8e9b4e82-ac7f-c189-cc17-045a3585def2)\n",
-      "\t Link 0: 25.781 GB/s\n",
-      "\t Link 1: 25.781 GB/s\n",
-      "\t Link 2: 25.781 GB/s\n",
-      "\t Link 3: 25.781 GB/s\n",
-      "\t Link 4: 25.781 GB/s\n",
-      "\t Link 5: 25.781 GB/s\n",
-      "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-a3d96d2e-c606-b23f-e9e0-59a3a507fc10)\n",
-      "\t Link 0: 25.781 GB/s\n",
-      "\t Link 1: 25.781 GB/s\n",
-      "\t Link 2: 25.781 GB/s\n",
-      "\t Link 3: 25.781 GB/s\n",
-      "\t Link 4: 25.781 GB/s\n",
-      "\t Link 5: 25.781 GB/s\n"
+      "GPU 0: A100-SXM4-40GB (UUID: GPU-2e4d2105-718d-3b94-6f0f-25c148681e83)\n",
+      "\t Link 0: 25 GB/s\n",
+      "\t Link 1: 25 GB/s\n",
+      "\t Link 2: 25 GB/s\n",
+      "\t Link 3: 25 GB/s\n",
+      "\t Link 4: 25 GB/s\n",
+      "\t Link 5: 25 GB/s\n",
+      "\t Link 6: 25 GB/s\n",
+      "\t Link 7: 25 GB/s\n",
+      "\t Link 8: 25 GB/s\n",
+      "\t Link 9: 25 GB/s\n",
+      "\t Link 10: 25 GB/s\n",
+      "\t Link 11: 25 GB/s\n",
+      "GPU 1: A100-SXM4-40GB (UUID: GPU-49615223-919e-6f9f-ad79-69d86bc1a13b)\n",
+      "\t Link 0: 25 GB/s\n",
+      "\t Link 1: 25 GB/s\n",
+      "\t Link 2: 25 GB/s\n",
+      "\t Link 3: 25 GB/s\n",
+      "\t Link 4: 25 GB/s\n",
+      "\t Link 5: 25 GB/s\n",
+      "\t Link 6: 25 GB/s\n",
+      "\t Link 7: 25 GB/s\n",
+      "\t Link 8: 25 GB/s\n",
+      "\t Link 9: 25 GB/s\n",
+      "\t Link 10: 25 GB/s\n",
+      "\t Link 11: 25 GB/s\n"
      ]
     }
    ],
@@ -198,12 +188,12 @@
      "text": [
       "\n",
       "Sampling Environment Check\n",
-      "Linux Kernel Paranoid Level = 1: OK\n",
+      "Linux Kernel Paranoid Level = 2: OK\n",
       "Linux Distribution = Ubuntu\n",
-      "Linux Kernel Version = 4.15.0-112-generic: OK\n",
+      "Linux Kernel Version = 4.18.0-305.12.1.el8_4.x86_64: OK\n",
       "Linux perf_event_open syscall available: OK\n",
       "Sampling trigger event available: OK\n",
-      "Intel(c) Last Branch Record support: Available\n",
+      "Intel(c) Last Branch Record support: Not Available\n",
       "Sampling Environment: OK\n"
      ]
     }
@@ -223,7 +213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -231,7 +221,11 @@
     "os.makedirs('./dataset/EN/32k', exist_ok=True)\n",
     "os.makedirs('./dataset/EN/50k', exist_ok=True)\n",
     "os.makedirs('./dataset/SV/32k', exist_ok=True)\n",
-    "os.makedirs('./dataset/SV/56k', exist_ok=True)"
+    "os.makedirs('./dataset/SV/56k', exist_ok=True)\n",
+    "os.makedirs('./sv_ckpt/', exist_ok=True)\n",
+    "os.makedirs('./profiles/naive', exist_ok=True)\n",
+    "os.makedirs('./profiles/2ndrun', exist_ok=True)\n",
+    "os.makedirs('./profiles/SV', exist_ok=True)"
    ]
   },
   {
@@ -247,6 +241,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "---\n",
     "### Tutorial Outline\n",
     "\n",
     "The following contents will be covered during the Bootcamp :\n",

+ 165 - 26
ai/Megatron/English/Python/jupyter_notebook/Day2-3_GPT_vocab_merge_files.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "valid-smoke",
+   "id": "maritime-macro",
    "metadata": {},
    "source": [
     "# \n",
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "novel-crash",
+   "id": "compliant-champion",
    "metadata": {},
    "source": [
     "#### let's review the source code of [gpt2 tokenizer](https://huggingface.co/transformers/_modules/transformers/tokenization_gpt2.html)\n",
@@ -46,20 +46,147 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "enhanced-vehicle",
+   "execution_count": 1,
+   "id": "thrown-aurora",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Defaulting to user installation because normal site-packages is not writeable\n",
+      "Requirement already satisfied: tokenizers in /home/x_zench/.local/lib/python3.8/site-packages (0.10.3)\n",
+      "Requirement already satisfied: transformers in /home/x_zench/.local/lib/python3.8/site-packages (4.10.0)\n",
+      "Requirement already satisfied: ipywidgets in /home/x_zench/.local/lib/python3.8/site-packages (7.6.4)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.4.1)\n",
+      "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers) (0.0.35)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.8/site-packages (from transformers) (2.24.0)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2021.3.17)\n",
+      "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from transformers) (20.9)\n",
+      "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (1.19.2)\n",
+      "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.8/site-packages (from transformers) (4.53.0)\n",
+      "Requirement already satisfied: huggingface-hub>=0.0.12 in /home/x_zench/.local/lib/python3.8/site-packages (from transformers) (0.0.16)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n",
+      "Requirement already satisfied: jupyterlab-widgets>=1.0.0; python_version >= \"3.6\" in /home/x_zench/.local/lib/python3.8/site-packages (from ipywidgets) (1.0.1)\n",
+      "Requirement already satisfied: ipykernel>=4.5.1 in /opt/conda/lib/python3.8/site-packages (from ipywidgets) (5.5.0)\n",
+      "Requirement already satisfied: ipython-genutils~=0.2.0 in /opt/conda/lib/python3.8/site-packages (from ipywidgets) (0.2.0)\n",
+      "Requirement already satisfied: nbformat>=4.2.0 in /opt/conda/lib/python3.8/site-packages (from ipywidgets) (5.1.2)\n",
+      "Requirement already satisfied: traitlets>=4.3.1 in /opt/conda/lib/python3.8/site-packages (from ipywidgets) (5.0.5)\n",
+      "Requirement already satisfied: widgetsnbextension~=3.5.0 in /home/x_zench/.local/lib/python3.8/site-packages (from ipywidgets) (3.5.1)\n",
+      "Requirement already satisfied: ipython>=4.0.0; python_version >= \"3.3\" in /opt/conda/lib/python3.8/site-packages (from ipywidgets) (7.21.0)\n",
+      "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (7.1.2)\n",
+      "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.0.1)\n",
+      "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.15.0)\n",
+      "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (3.0.4)\n",
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (1.25.11)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2020.12.5)\n",
+      "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2.10)\n",
+      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->transformers) (2.4.7)\n",
+      "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.8/site-packages (from huggingface-hub>=0.0.12->transformers) (3.7.4.3)\n",
+      "Requirement already satisfied: tornado>=4.2 in /opt/conda/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)\n",
+      "Requirement already satisfied: jupyter-client in /opt/conda/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1.12)\n",
+      "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /opt/conda/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets) (3.0.2)\n",
+      "Requirement already satisfied: jupyter-core in /opt/conda/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets) (4.7.1)\n",
+      "Requirement already satisfied: notebook>=4.4.1 in /opt/conda/lib/python3.8/site-packages (from widgetsnbextension~=3.5.0->ipywidgets) (6.2.0)\n",
+      "Requirement already satisfied: decorator in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (4.4.2)\n",
+      "Requirement already satisfied: pygments in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (2.8.1)\n",
+      "Requirement already satisfied: pexpect>4.3; sys_platform != \"win32\" in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (4.8.0)\n",
+      "Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.17.0)\n",
+      "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (3.0.8)\n",
+      "Requirement already satisfied: pickleshare in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.7.5)\n",
+      "Requirement already satisfied: backcall in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.2.0)\n",
+      "Requirement already satisfied: setuptools>=18.5 in /opt/conda/lib/python3.8/site-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (50.3.1.post20201107)\n",
+      "Requirement already satisfied: pyzmq>=13 in /opt/conda/lib/python3.8/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (22.0.3)\n",
+      "Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.8/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.1)\n",
+      "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (20.3.0)\n",
+      "Requirement already satisfied: pyrsistent>=0.14.0 in /opt/conda/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (0.17.3)\n",
+      "Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (20.1.0)\n",
+      "Requirement already satisfied: nbconvert in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (6.0.7)\n",
+      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.11.3)\n",
+      "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.9.0)\n",
+      "Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.9.3)\n",
+      "Requirement already satisfied: Send2Trash>=1.5.0 in /opt/conda/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.0)\n",
+      "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.8/site-packages (from pexpect>4.3; sys_platform != \"win32\"->ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.7.0)\n",
+      "Requirement already satisfied: parso>=0.7.0 in /opt/conda/lib/python3.8/site-packages (from jedi>=0.16->ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.8.1)\n",
+      "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets) (0.2.5)\n",
+      "Requirement already satisfied: cffi>=1.0.0 in /opt/conda/lib/python3.8/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.14.3)\n",
+      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.7.1)\n",
+      "Requirement already satisfied: entrypoints>=0.2.2 in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.3)\n",
+      "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.4.3)\n",
+      "Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.1.2)\n",
+      "Requirement already satisfied: testpath in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.4.4)\n",
+      "Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.3)\n",
+      "Requirement already satisfied: mistune<2,>=0.8.1 in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.8.4)\n",
+      "Requirement already satisfied: bleach in /opt/conda/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (3.3.0)\n",
+      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/lib/python3.8/site-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.1.1)\n",
+      "Requirement already satisfied: pycparser in /opt/conda/lib/python3.8/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.20)\n",
+      "Requirement already satisfied: nest-asyncio in /opt/conda/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.1)\n",
+      "Requirement already satisfied: async-generator in /opt/conda/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.10)\n",
+      "Requirement already satisfied: webencodings in /opt/conda/lib/python3.8/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.1)\n"
+     ]
+    }
+   ],
    "source": [
     "!pip install tokenizers  transformers ipywidgets"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "consolidated-substance",
+   "execution_count": 2,
+   "id": "critical-apparel",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2021-09-15 09:29:57--  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json\n",
+      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.95.125\n",
+      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.95.125|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 1042301 (1018K) [application/json]\n",
+      "Saving to: ‘gpt2-vocab.json’\n",
+      "\n",
+      "gpt2-vocab.json     100%[===================>]   1018K  1.53MB/s    in 0.7s    \n",
+      "\n",
+      "2021-09-15 09:29:58 (1.53 MB/s) - ‘gpt2-vocab.json’ saved [1042301/1042301]\n",
+      "\n",
+      "--2021-09-15 09:29:58--  https://huggingface.co/openai-gpt/resolve/main/vocab.json\n",
+      "Resolving huggingface.co (huggingface.co)... 107.23.77.87, 34.200.164.230, 34.195.144.223, ...\n",
+      "Connecting to huggingface.co (huggingface.co)|107.23.77.87|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 815973 (797K) [application/json]\n",
+      "Saving to: ‘vocab.json’\n",
+      "\n",
+      "vocab.json          100%[===================>] 796.85K  1.78MB/s    in 0.4s    \n",
+      "\n",
+      "2021-09-15 09:29:59 (1.78 MB/s) - ‘vocab.json’ saved [815973/815973]\n",
+      "\n",
+      "--2021-09-15 09:30:00--  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt\n",
+      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.95.125\n",
+      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.95.125|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 456318 (446K) [text/plain]\n",
+      "Saving to: ‘gpt2-merges.txt’\n",
+      "\n",
+      "gpt2-merges.txt     100%[===================>] 445.62K  1.00MB/s    in 0.4s    \n",
+      "\n",
+      "2021-09-15 09:30:01 (1.00 MB/s) - ‘gpt2-merges.txt’ saved [456318/456318]\n",
+      "\n",
+      "--2021-09-15 09:30:01--  https://huggingface.co/openai-gpt/resolve/main/merges.txt\n",
+      "Resolving huggingface.co (huggingface.co)... 107.23.77.87, 34.200.164.230, 34.195.144.223, ...\n",
+      "Connecting to huggingface.co (huggingface.co)|107.23.77.87|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 458495 (448K) [text/plain]\n",
+      "Saving to: ‘merges.txt’\n",
+      "\n",
+      "merges.txt          100%[===================>] 447.75K  1007KB/s    in 0.4s    \n",
+      "\n",
+      "2021-09-15 09:30:02 (1007 KB/s) - ‘merges.txt’ saved [458495/458495]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json\n",
     "!wget https://huggingface.co/openai-gpt/resolve/main/vocab.json\n",
@@ -69,8 +196,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "copyrighted-present",
+   "execution_count": 3,
+   "id": "circular-covering",
    "metadata": {},
    "outputs": [
     {
@@ -88,7 +215,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "critical-specific",
+   "id": "liked-reach",
    "metadata": {},
    "source": [
     "## examine the vocab and merge files"
@@ -96,16 +223,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "chemical-ebony",
+   "execution_count": 4,
+   "id": "bored-standing",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "noted that the Ġ = space +256 to form that control letter\n",
-      "['Ġdegrees', 'Donald', 'Ġcentre', 'Ġsharing', 'Ġwinter', 'ĠCO', 'Che', 'ĠÎ', 'MP', 'Ġunw', 'Ġfewer', 'ĠMir', 'Ġsomewhere', 'ĠKey', 'Ġattacked', 'ĠKir', 'Ġdomain', 'Ġstronger', 'Ġ99', 'Ġpenalty']\n"
+      "noted that the Ġ = space + 256 to form that control letter\n",
+      "['Ġassorted', 'ĠRevision', 'ĠPiano', 'ĠGideon', 'Ocean', 'Ġsalon', 'Ġbustling', 'ognitive', 'ĠRahman', 'Ġwaiter', 'Ġpresets', 'ĠOsh', 'ĠGHC', 'operator', 'Ġreptiles', 'Ġ413', 'ĠGarr', 'ĠChak', 'Ġhashes', 'Ġfailings']\n"
      ]
     }
    ],
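
The "Ġ = space + 256" remark in the cell above refers to GPT-2's byte-level BPE, which remaps raw bytes onto visible unicode characters: the space byte 0x20 (32) lands on code point 32 + 256 = 288, i.e. 'Ġ' (U+0120), which is why word-initial vocab entries carry a leading 'Ġ'. A minimal sketch of that observation, assuming the `transformers` package installed by the earlier cell (the `GPT2Tokenizer.from_pretrained` round-trip is illustrative, not the notebook's exact code):

```python
# "Ġ = space + 256": GPT-2's byte-level BPE maps the space byte (0x20 = 32)
# to the visible code point 32 + 256 = 288, which is 'Ġ' (U+0120).
space_byte = ord(" ")              # 32
marker = chr(space_byte + 256)     # chr(288)
print(marker, marker == "\u0120")  # Ġ True

# Round-tripping a string through the HuggingFace GPT-2 tokenizer shows the
# same marker on word-initial tokens (downloads the stock gpt2 vocab/merges).
from transformers import GPT2Tokenizer
tok = GPT2Tokenizer.from_pretrained("gpt2")
print(tok.tokenize(" Hello world"))  # ['ĠHello', 'Ġworld']
```
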
@@ -122,8 +249,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "mediterranean-outreach",
+   "execution_count": 5,
+   "id": "driven-coaching",
    "metadata": {},
    "outputs": [
     {
@@ -144,7 +271,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "controversial-albuquerque",
+   "id": "celtic-cheese",
    "metadata": {},
    "source": [
     "## sanity check load from transformer GPT2Tokenizer "
@@ -152,8 +279,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "classical-stretch",
+   "execution_count": 6,
+   "id": "handled-cooper",
    "metadata": {},
    "outputs": [
     {
@@ -186,8 +313,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "cordless-science",
+   "execution_count": 7,
+   "id": "electrical-performance",
    "metadata": {},
    "outputs": [
     {
@@ -220,8 +347,8 @@
     "        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()\n",
     "        tokenizer.decoder = ByteLevelDecoder()\n",
     "    return tokenizer , vocab\n",
-    "vocab_file='./Megatron-LM/gpt2-vocab.json'\n",
-    "merge_file='./Megatron-LM/gpt2-merges.txt'\n",
+    "vocab_file='./gpt2-vocab.json'\n",
+    "merge_file='./gpt2-merges.txt'\n",
     "tokenizers_gpt,_=load_tokenizer(vocab_file,merge_file,True)\n",
     "sample_text=' Hello world' \n",
     "output=tokenizers_gpt.encode(sample_text)\n",
@@ -244,8 +371,20 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "funny-scheduling",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## clean up\n",
+    "!rm merges.txt\n",
+    "!rm vocab.json"
+   ]
+  },
+  {
    "cell_type": "markdown",
-   "id": "temporal-latest",
+   "id": "placed-necessity",
    "metadata": {},
    "source": [
     "---\n",
@@ -259,7 +398,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "educational-ecology",
+   "id": "respected-class",
    "metadata": {},
    "source": [
     "-----\n",

+ 60 - 29
ai/Megatron/English/Python/jupyter_notebook/Day2-4_jsonfy_and_process2mmap.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "convertible-whale",
+   "id": "dependent-chemistry",
    "metadata": {},
    "source": [
     "# \n",
@@ -23,8 +23,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "surrounded-counter",
+   "execution_count": 1,
+   "id": "square-louisville",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,15 +35,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "complete-lindsay",
+   "execution_count": 2,
+   "id": "human-appliance",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.07 ms ± 55.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "3.84 ms ± 36.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -54,15 +54,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "conventional-mason",
+   "execution_count": 3,
+   "id": "heard-baseball",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "62 µs ± 136 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+      "43 µs ± 120 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
      ]
     }
    ],
@@ -73,8 +73,19 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "dynamic-nudist",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## clean up\n",
+    "!rm myarr.npy"
+   ]
+  },
+  {
    "cell_type": "markdown",
-   "id": "functioning-stage",
+   "id": "soviet-jumping",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -86,7 +97,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "eastern-habitat",
+   "id": "acting-covering",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -102,25 +113,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "id": "finite-marina",
+   "execution_count": 5,
+   "id": "organic-malaysia",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "finished processing 74 lines to loose json format\n"
+      "finished processing 71 lines to loose json format\n"
      ]
     }
    ],
    "source": [
-    "!python create_loose_json.py --infile ./Megatron-LM/dataset/EN/extractedNVblogs.txt --outfile ./Megatron-LM/dataset/EN/extractedNVblogs.json"
+    "!python create_loose_json.py --infile ../dataset/EN/extractedNVblogs.txt --outfile ../dataset/EN/extractedNVblogs.json"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "proof-pakistan",
+   "id": "rubber-absolute",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -155,7 +166,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "angry-canvas",
+   "id": "lined-transfer",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -191,21 +202,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "regional-stake",
+   "execution_count": 6,
+   "id": "marked-midnight",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gpt2-merges.txt  gpt2-vocab.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mv gpt2-vocab.json ../dataset/EN/50k/\n",
+    "!mv gpt2-merges.txt ../dataset/EN/50k/\n",
+    "!ls ../dataset/EN/50k/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "adjustable-hammer",
    "metadata": {},
    "outputs": [],
    "source": [
     "INPUT_JSON_FILE='../dataset/EN/extractedNVblogs.json'\n",
-    "OUTPUT_PATH='../dataset/EN/CustomSentenceSplitter'\n",
+    "OUTPUT_PATH='../dataset/EN/NVblog'\n",
     "VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'\n",
     "MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'\n",
-    "NUM_CPUS=16\n"
+    "NUM_CPUS=16"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "similar-commonwealth",
+   "id": "hidden-patrick",
    "metadata": {},
    "source": [
     "---\n",
@@ -239,15 +270,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "framed-point",
+   "execution_count": 10,
+   "id": "professional-lawyer",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Opening ./Megatron-LM/dataset/EN/extractedNVblogs.json\n",
+      "Opening ../dataset/EN/extractedNVblogs.json\n",
       "> building GPT2BPETokenizer tokenizer ...\n",
       " > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)\n",
       "> building GPT2BPETokenizer tokenizer ...\n",
@@ -267,8 +298,8 @@
       "> building GPT2BPETokenizer tokenizer ...\n",
       "> building GPT2BPETokenizer tokenizer ...\n",
       "Vocab size: 50257\n",
-      "Output prefix: ./Megatron-LM/dataset/EN/NVblogs\n",
-      "Time to startup: 0.5460700988769531\n",
+      "Output prefix: ../dataset/EN/NVblog\n",
+      "Time to startup: 0.1618051528930664\n",
       " > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)\n",
       " > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)\n",
       " > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)\n",
@@ -303,7 +334,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "endless-vietnamese",
+   "id": "valuable-equilibrium",
    "metadata": {},
    "source": [
     "---\n",
@@ -317,7 +348,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "married-necklace",
+   "id": "accurate-drinking",
    "metadata": {},
    "source": [
     "-----\n",

+ 341 - 265
ai/Megatron/English/Python/jupyter_notebook/Day2-5_Observe_GPT_runs_vs_performance.ipynb

File diff not shown because it is too large

+ 322 - 559
ai/Megatron/English/Python/jupyter_notebook/Day3-3_train_own_GPT2BPETokenizer.ipynb

File diff not shown because it is too large

+ 12622 - 20
ai/Megatron/English/Python/jupyter_notebook/Day3-4_customize_process2mmap.ipynb

File diff not shown because it is too large

+ 155 - 114
ai/Megatron/English/Python/jupyter_notebook/Day3-5_run_Megatron_with_varying_config.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "boxed-privilege",
+   "id": "charged-allen",
    "metadata": {},
    "source": [
     "# \n",
@@ -50,7 +50,31 @@
   },
   {
    "cell_type": "markdown",
-   "id": "royal-holiday",
+   "id": "continuing-passport",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Hint :\n",
+    "### call out a terminal and type in **nvidia-smi** to monitor the GPUs' utils and power consumption \n",
+    "### remember to fill up the GPU memory\n",
+    "![call out a terminal ](./Megatron-LM/pics/Alt_callout2terminals.JPG)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "corrected-bacteria",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## modify and rerun the below to get a even bigger GPT model \n",
+    "<a id=\"MODIFY_CELL\"></a>\n",
+    "\n",
+    "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#Rerun_Cell\">Jump to ReRun Cell</a> "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dramatic-opinion",
    "metadata": {},
    "source": [
     "<a id=\"Rerun_Cell\"></a>"
@@ -58,18 +82,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "id": "opening-description",
+   "execution_count": 1,
+   "id": "massive-industry",
    "metadata": {},
    "outputs": [],
    "source": [
-    "!rm -fr ./Megatron-LM/sv_ckpt/* "
+    "!rm -fr ../sv_ckpt/* "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "future-explorer",
+   "execution_count": 2,
+   "id": "understood-swimming",
    "metadata": {},
    "outputs": [
     {
@@ -89,22 +113,22 @@
     "NODE_RANK=0\n",
     "\n",
     "### modify this section to point the file to its own path \n",
-    "CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'\n",
-    "DATA_PATH='../dataset/SV/webnyheter2013_text_document'\n",
-    "VOCAB_FILE='../dataset/SV/32k/vocab.json'\n",
-    "MERGE_FILE='../dataset/SV/32k/merges.txt'\n",
-    "PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path\n",
+    "CHECKPOINT_PATH='../sv_ckpt/'\n",
+    "DATA_PATH='../dataset/SV/webnyheter2013_56kvocab_text_document'\n",
+    "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
+    "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
+    "PROFILE_OUTPUT_PATH='../profiles/SV/nsys_sv_' # modify this to your own profile path\n",
     "\n",
     "#### [TODO]--------------- params in the following block are allowed to change -----------#### \n",
-    "WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
-    "GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
+    "WORLD_SIZE=2 # <--- remember to change the number of GPUs you actually have in your system\n",
+    "GPUS_PER_NODE=2 # <--- remember to change the number of GPUs you actually have in your system\n",
     "\n",
-    "TENSOR_MP_SIZE=8\n",
+    "TENSOR_MP_SIZE=2\n",
     "PIPELINE_MP_SIZE=1\n",
-    "LAYERS=64\n",
-    "HIDDEN_SZ=2048\n",
+    "LAYERS=32\n",
+    "HIDDEN_SZ=4096\n",
     "NUM_ATTN_HEADS=32\n",
-    "MICRO_BZ=64\n",
+    "MICRO_BZ=8\n",
     "GLOBAL_BZ=512\n",
     "SEQ_LEN=512\n",
     "MAX_POS_EM=512\n",
@@ -151,26 +175,27 @@
   },
   {
    "cell_type": "markdown",
-   "id": "confident-prerequisite",
+   "id": "monetary-trial",
    "metadata": {},
    "source": [
     "---\n",
     "## check how big is your model - \n",
-    "I got 1 Billion :)  what about you ?"
+    "modify the parameters in the [params_cnt.sh](./params_cnt.sh)\n",
+    "I got 6 Billion :)  what about you ?"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "id": "affecting-function",
+   "execution_count": 3,
+   "id": "afraid-promise",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3\n",
-      "3289513984\n"
+      "6\n",
+      "6675628032\n"
      ]
     }
    ],
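
The 6675628032 reported by params_cnt.sh above is consistent with the usual transformer estimate of roughly 12·LAYERS·HIDDEN_SZ² weights in the transformer blocks plus the token and position embeddings. A rough cross-check (an approximation in plain Python, not the script itself; the padded vocab size 56064 is taken from the training log below):

```python
# Approximate GPT parameter count for this configuration
# (LAYERS=32, HIDDEN_SZ=4096, SEQ_LEN=512, padded vocab 56064 per the log below).
LAYERS, HIDDEN_SZ, SEQ_LEN = 32, 4096, 512
PADDED_VOCAB = 56064

transformer_params = 12 * LAYERS * HIDDEN_SZ ** 2        # attention + 4*H MLP per layer
embedding_params = (PADDED_VOCAB + SEQ_LEN) * HIDDEN_SZ  # token + position embeddings
print(f"{(transformer_params + embedding_params) / 1e9:.2f} B")  # ~6.67 B vs. 6675628032 reported
```
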
@@ -180,7 +205,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "hairy-dominican",
+   "id": "portuguese-freedom",
    "metadata": {},
    "source": [
     "---\n",
@@ -203,48 +228,37 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "competent-romania",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Re-run this cell below to get an even bigger GPT model\n",
+    "## remember to modify the [params count](./params_cnt.sh) to check how big is your model\n",
+    "## click the below to go back to Modify the profile_SVGPT_BIG.sh \n",
+    "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#MODIFY_CELL\">Jump back to modify and overwrite profile_SVGPT_BIG.sh </a> \n",
+    "<a id=\"Rerun_Cell\"></a>"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 31,
-   "id": "acknowledged-brake",
+   "execution_count": 4,
+   "id": "injured-pasta",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
+      "Initializing NVTX monkey patchesInitializing NVTX monkey patches\n",
+      "\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "Done with NVTX monkey patching\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "using world size: 8, data-parallel-size: 1, tensor-model-parallel size: 8, pipeline-model-parallel size: 1 \n",
+      "Done with NVTX monkey patchingDone with NVTX monkey patching\n",
+      "\n",
+      "using world size: 2, data-parallel-size: 1, tensor-model-parallel size: 2, pipeline-model-parallel size: 1 \n",
       "using torch.float16 for parameters ...\n",
       "------------------------ arguments ------------------------\n",
       "  accumulate_allreduce_grads_in_fp32 .............. False\n",
@@ -272,7 +286,7 @@
       "  consumed_valid_samples .......................... 0\n",
       "  data_impl ....................................... mmap\n",
       "  data_parallel_size .............................. 1\n",
-      "  data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_text_document']\n",
+      "  data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_56kvocab_text_document']\n",
       "  dataloader_type ................................. single\n",
       "  DDP_impl ........................................ local\n",
       "  decoder_seq_length .............................. None\n",
@@ -286,14 +300,14 @@
       "  evidence_data_path .............................. None\n",
       "  exit_duration_in_mins ........................... None\n",
       "  exit_interval ................................... None\n",
-      "  ffn_hidden_size ................................. 8192\n",
+      "  ffn_hidden_size ................................. 16384\n",
       "  finetune ........................................ False\n",
       "  fp16 ............................................ True\n",
       "  fp16_lm_cross_entropy ........................... False\n",
       "  fp32_residual_connection ........................ False\n",
       "  global_batch_size ............................... 512\n",
       "  hidden_dropout .................................. 0.1\n",
-      "  hidden_size ..................................... 2048\n",
+      "  hidden_size ..................................... 4096\n",
       "  hysteresis ...................................... 2\n",
       "  ict_head_size ................................... None\n",
       "  ict_load ........................................ None\n",
@@ -303,10 +317,10 @@
       "  init_method_std ................................. 0.02\n",
       "  init_method_xavier_uniform ...................... False\n",
       "  initial_loss_scale .............................. 4294967296\n",
-      "  kv_channels ..................................... 64\n",
+      "  kv_channels ..................................... 128\n",
       "  layernorm_epsilon ............................... 1e-05\n",
       "  lazy_mpu_init ................................... None\n",
-      "  load ............................................ ./Megatron-LM/sv_ckpt/\n",
+      "  load ............................................ ../sv_ckpt/\n",
       "  local_rank ...................................... 0\n",
       "  log_batch_size_to_tensorboard ................... False\n",
       "  log_interval .................................... 10\n",
@@ -329,8 +343,8 @@
       "  mask_prob ....................................... 0.15\n",
       "  masked_softmax_fusion ........................... True\n",
       "  max_position_embeddings ......................... 512\n",
-      "  merge_file ...................................... ../dataset/SV/32k/merges.txt\n",
-      "  micro_batch_size ................................ 64\n",
+      "  merge_file ...................................... ../dataset/SV/56k/merges.txt\n",
+      "  micro_batch_size ................................ 8\n",
       "  min_loss_scale .................................. 1.0\n",
       "  min_lr .......................................... 1e-05\n",
       "  mmap_warmup ..................................... False\n",
@@ -341,7 +355,7 @@
       "  num_attention_heads ............................. 32\n",
       "  num_channels .................................... 3\n",
       "  num_classes ..................................... 1000\n",
-      "  num_layers ...................................... 64\n",
+      "  num_layers ...................................... 32\n",
       "  num_layers_per_virtual_pipeline_stage ........... None\n",
       "  num_workers ..................................... 2\n",
       "  onnx_safe ....................................... None\n",
@@ -360,7 +374,7 @@
       "  retriever_score_scaling ......................... False\n",
       "  retriever_seq_length ............................ 256\n",
       "  sample_rate ..................................... 1.0\n",
-      "  save ............................................ ./Megatron-LM/sv_ckpt/\n",
+      "  save ............................................ ../sv_ckpt/\n",
       "  save_interval ................................... 100\n",
       "  scatter_gather_tensors_in_pipeline .............. True\n",
       "  seed ............................................ 1234\n",
@@ -368,7 +382,7 @@
       "  sgd_momentum .................................... 0.9\n",
       "  short_seq_prob .................................. 0.1\n",
       "  split ........................................... 949,50,1\n",
-      "  tensor_model_parallel_size ...................... 8\n",
+      "  tensor_model_parallel_size ...................... 2\n",
       "  tensorboard_dir ................................. None\n",
       "  tensorboard_log_interval ........................ 1\n",
       "  tensorboard_queue_size .......................... 1000\n",
@@ -382,60 +396,54 @@
       "  use_one_sent_docs ............................... False\n",
       "  virtual_pipeline_model_parallel_size ............ None\n",
       "  vocab_extra_ids ................................. 0\n",
-      "  vocab_file ...................................... ../dataset/SV/32k/vocab.json\n",
+      "  vocab_file ...................................... ../dataset/SV/56k/vocab.json\n",
       "  weight_decay .................................... 0.01\n",
-      "  world_size ...................................... 8\n",
+      "  world_size ...................................... 2\n",
       "-------------------- end of arguments ---------------------\n",
-      "setting number of micro-batches to constant 8\n",
+      "setting number of micro-batches to constant 64\n",
       "> building GPT2BPETokenizer tokenizer ...\n",
-      " > padded vocab (size: 32000) with 768 dummy tokens (new size: 32768)\n",
+      " > padded vocab (size: 56000) with 64 dummy tokens (new size: 56064)\n",
       "> initializing torch distributed ...\n",
-      "> initializing tensor model parallel with size 8\n",
+      "> initializing tensor model parallel with size 2\n",
       "> initializing pipeline model parallel with size 1\n",
       "> setting random seeds to 1234 ...\n",
       "> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234\n",
       "> compiling dataset index builder ...\n",
-      "make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
+      "make: Entering directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
       "make: Nothing to be done for 'default'.\n",
-      "make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
-      ">>> done with dataset index builder. Compilation time: 0.167 seconds\n",
+      "make: Leaving directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
+      ">>> done with dataset index builder. Compilation time: 0.145 seconds\n",
       "> compiling and loading fused kernels ...\n",
       "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
       "Building extension module scaled_upper_triang_masked_softmax_cuda...\n",
       "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
       "ninja: no work to do.\n",
       "Loading extension module scaled_upper_triang_masked_softmax_cuda...\n",
       "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
       "Building extension module scaled_masked_softmax_cuda...\n",
       "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
       "ninja: no work to do.\n",
       "Loading extension module scaled_masked_softmax_cuda...\n",
       "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
       "Building extension module fused_mix_prec_layer_norm_cuda...\n",
       "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
       "ninja: no work to do.\n",
       "Loading extension module fused_mix_prec_layer_norm_cuda...\n",
-      ">>> done with compiling and loading fused kernels. Compilation time: 18.065 seconds\n",
-      "time to initialize megatron (seconds): 90.261\n",
-      "[after megatron is initialized] datetime: 2021-08-30 08:59:22 \n",
+      ">>> done with compiling and loading fused kernels. Compilation time: 2.868 seconds\n",
+      "time to initialize megatron (seconds): 43.936\n",
+      "[after megatron is initialized] datetime: 2021-09-15 11:55:55 \n",
       "building GPT model ...\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 412995584\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 3339395072\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 3339395072\n",
       "setting training iterations to 0\n",
       "> learning rate decay style: cosine\n",
-      "WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
+      "WARNING: could not find the metadata file ../sv_ckpt/latest_checkpointed_iteration.txt \n",
       "    will not load any checkpoints and will start from random\n",
-      "time (ms) | load-checkpoint: 25.10\n",
-      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 08:59:28 \n",
+      "time (ms) | load-checkpoint: 2.66\n",
+      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-09-15 11:55:56 \n",
       "> building train, validation, and test datasets ...\n",
       " > datasets target sizes (minimum size):\n",
       "    train:      100\n",
@@ -448,7 +456,7 @@
       "    reading document index...\n",
       "    creating numpy buffer of mmap...\n",
       "    creating memory view of numpy buffer...\n",
-      " > finished creating indexed dataset in 0.004143 seconds\n",
+      " > finished creating indexed dataset in 0.004941 seconds\n",
       "    number of documents: 1249010\n",
       " > dataset split:\n",
       "    train:\n",
@@ -457,24 +465,57 @@
       "     document indices in [1185311, 1247761) total of 62450 documents\n",
       "    test:\n",
       "     document indices in [1247761, 1249010) total of 1249 documents\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > only one epoch required, setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.066494\n",
+      "    using:\n",
+      "     number of documents:       1185311\n",
+      "     number of epochs:          1\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   51303\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.008808\n",
+      " > building shuffle index with split [0, 51303) and [51303, 51303) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.002738\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
       "    loaded indexed file in 0.005 seconds\n",
-      "    total number of samples: 53948\n",
+      "    total number of samples: 51304\n",
       "    total number of epochs: 1\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
-      "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5695\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > last epoch number of samples (2438) is larger than 80% of number of samples per epoch (2708), setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.005265\n",
+      "    using:\n",
+      "     number of documents:       62450\n",
+      "     number of epochs:          2\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   5416\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.001357\n",
+      " > building shuffle index with split [0, 5416) and [5416, 5416) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.002597\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+      "    loaded indexed file in 0.002 seconds\n",
+      "    total number of samples: 5417\n",
       "    total number of epochs: 2\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
-      "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5192\n",
-      "    total number of epochs: 91\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (54), setting separate_last_epoch to True\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.004714\n",
+      "    using:\n",
+      "     number of documents:       1249\n",
+      "     number of epochs:          96\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   5188\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.001624\n",
+      " > building shuffle index with split [0, 5134) and [5134, 5188) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.001298\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+      "    loaded indexed file in 0.002 seconds\n",
+      "    total number of samples: 5189\n",
+      "    total number of epochs: 96\n",
       "> building indices for blendable datasets ...\n",
       " > sample ratios:\n",
       "   dataset 0, input: 1, achieved: 1\n",
@@ -488,17 +529,17 @@
       "   dataset 0, input: 1, achieved: 1\n",
       "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
       "> finished creating GPT datasets ...\n",
-      "[after dataloaders are built] datetime: 2021-08-30 08:59:32 \n",
+      "[after dataloaders are built] datetime: 2021-09-15 11:55:58 \n",
       "done with setup ...\n",
       "training ...\n",
-      "time (ms) | model-and-optimizer-setup: 6065.80 | train/valid/test-data-iterators-setup: 2661.91\n",
-      "[after training is done] datetime: 2021-08-30 08:59:32 \n",
+      "time (ms) | model-and-optimizer-setup: 929.42 | train/valid/test-data-iterators-setup: 1004.53\n",
+      "[after training is done] datetime: 2021-09-15 11:55:58 \n",
       "------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for val data | lm loss value: 1.081321E+01 | lm loss PPL: 4.967259E+04 | \n",
+      " validation loss at the end of training for val data | lm loss value: 1.171452E+01 | lm loss PPL: 1.223352E+05 | \n",
       "------------------------------------------------------------------------------------------------------------------\n",
       "Evaluating iter 10/10\n",
       "-------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for test data | lm loss value: 1.081394E+01 | lm loss PPL: 4.970880E+04 | \n",
+      " validation loss at the end of training for test data | lm loss value: 1.171400E+01 | lm loss PPL: 1.222719E+05 | \n",
       "-------------------------------------------------------------------------------------------------------------------\n"
      ]
     }
@@ -509,7 +550,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "determined-right",
+   "id": "entertaining-transparency",
    "metadata": {},
    "source": [
     "## Remember to copy and paste your output on Slack or Zoom\n",
@@ -518,7 +559,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "searching-worthy",
+   "id": "hidden-minister",
    "metadata": {},
    "source": [
     "-----\n",

+ 0 - 126
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/Dlprof_pretrain_gpt.py

@@ -1,126 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Pretrain GPT"""
-
-import torch
-from functools import partial
-from megatron import get_args
-from megatron import print_rank_0
-from megatron import get_timers
-from megatron import get_tokenizer
-from megatron import mpu
-from megatron.data.gpt_dataset import build_train_valid_test_datasets
-from megatron.model import GPTModel
-from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids
-from megatron.utils import average_losses_across_data_parallel_group
-import pyprof
-pyprof.init(enable_function_stack=True)
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(
-        num_tokentypes=0,
-        parallel_output=True,
-        pre_process=pre_process,
-        post_process=post_process
-    )
-    return model
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Items and their type.
-    keys = ['text']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = mpu.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    return tokens, labels, loss_mask, attention_mask, position_ids
-
-def loss_func(loss_mask, output_tensor):
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator').start()
-    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator)
-    timers('batch-generator').stop()
-
-    output_tensor = model(tokens, position_ids, attention_mask,
-                          labels=labels)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        data_impl=args.data_impl,
-        splits_string=args.split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=args.seq_length,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup))
-    print_rank_0("> finished creating GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-if __name__ == "__main__":
-    with torch.autograd.profiler.emit_nvtx():
-        pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
-             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

+ 0 - 49
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/nsys_test.sh

@@ -1,49 +0,0 @@
-# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
-GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1 #<-- currently we are using 1 node multigpus
-NODE_RANK=0
-WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
-
-CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'
-DATA_PATH='../dataset/EN/NVblogs_text_document'
-VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
-MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
-PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/naive/' # modify this to your own profile path
-
-
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-nsys profile --stats=true --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt -o /home/zcharpy/profiles/GPT360M_naive \
-    python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-    ./Megatron-LM/pretrain_gpt.py \
-       --num-layers 16 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --micro-batch-size 1 \
-       --global-batch-size 8 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-samples 100 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file $VOCAB_FILE \
-       --merge-file $MERGE_FILE \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --lr-decay-style cosine \
-       --min-lr 1.0e-5 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --checkpoint-activations \
-       --log-interval 10 \
-       --save-interval 100 \
-       --eval-interval 100 \
-       --eval-iters 10 

+ 5 - 5
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_2nd_run.sh

@@ -9,11 +9,11 @@ WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in y
 TENSOR_MP_SIZE=8
 PIPELINE_MP_SIZE=1
 ### modify this section to point the file to its own path 
-CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'
-DATA_PATH='../dataset/EN/NVblogs_text_document'
-VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
-MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
-PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path
+CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it 
+DATA_PATH='../dataset/EN/NVblog_text_document' ## modify this path if you customize it 
+VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json' ## modify this path if you customize it 
+MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt' ## modify this path if you customize it 
+PROFILE_OUTPUT_PATH='../profiles/2ndrun/nsys_improved' # modify this to your own profile path
 
 export OMP_NUM_THREADS=1
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

+ 11 - 11
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_SVGPT_BIG.sh

@@ -5,22 +5,22 @@ NNODES=1 #<-- currently we are using 1 node multigpus
 NODE_RANK=0
 
 ### modify this section to point the file to its own path 
-CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'
-DATA_PATH='../dataset/EN/NVblogs_text_document'
-VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
-MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
-PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path
+CHECKPOINT_PATH='../sv_ckpt/'
+DATA_PATH='../dataset/SV/webnyheter2013_56kvocab_text_document'
+VOCAB_FILE='../dataset/SV/56k/vocab.json'
+MERGE_FILE='../dataset/SV/56k/merges.txt'
+PROFILE_OUTPUT_PATH='../profiles/SV/nsys_sv_' # modify this to your own profile path
 
 #### [TODO]--------------- params in the following block are allowed to change -----------#### 
-WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
-GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
+WORLD_SIZE=2 # <--- remember to change this to the number of GPUs you actually have in your system
+GPUS_PER_NODE=2 # <--- remember to change this to the number of GPUs you actually have in your system
 
-TENSOR_MP_SIZE=8
+TENSOR_MP_SIZE=2
 PIPELINE_MP_SIZE=1
 LAYERS=32
-HIDDEN_SZ=2048
+HIDDEN_SZ=4096
 NUM_ATTN_HEADS=32
-MICRO_BZ=64
+MICRO_BZ=8
 GLOBAL_BZ=512
 SEQ_LEN=512
 MAX_POS_EM=512
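
Doubling HIDDEN_SZ to 4096 while keeping 32 layers makes this a much larger model, so a quick size estimate helps judge whether the configuration fits in GPU memory before profiling. A rough back-of-the-envelope count (an approximation, not the exact number Megatron prints): each transformer layer holds about 12·h² weights plus biases and layernorms, and the embedding tables add roughly (vocab + seq)·h; the 56k figure below is taken from the Swedish 56k vocab this script points to.

    layers, hidden, vocab, seq = 32, 4096, 56000, 512     # values taken from the script above
    per_layer = 12 * hidden ** 2 + 13 * hidden            # QKV + projection + 4h MLP + biases/layernorms (approx.)
    embeddings = (vocab + seq) * hidden                    # token + position embeddings
    total = layers * per_layer + embeddings
    print(f"~{total / 1e9:.2f} B parameters")              # roughly 6.7 B for these settings

With TENSOR_MP_SIZE=2 this is consistent with the ~3.34 B parameters per tensor-parallel rank reported in the training log earlier in this commit.
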
@@ -30,7 +30,7 @@ export OMP_NUM_THREADS=1
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
 ## for nsys run
-nsys profile --stats=false --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt,nvtx -o $PROFILE_OUTPUT_PATH \
+#nsys profile --stats=false --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt,nvtx -o $PROFILE_OUTPUT_PATH \
 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
     ./Megatron-LM/Dlprof_pretrain_gpt.py \
        --tensor-model-parallel-size $TENSOR_MP_SIZE \

+ 7 - 7
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_naive_run.sh

@@ -1,17 +1,17 @@
 # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
-GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
+GPUS_PER_NODE=2 # <--- remember to change this to the number of GPUs you actually have in your system
 # Change for multinode config
 MASTER_ADDR=localhost
 MASTER_PORT=6000
 NNODES=1 #<-- currently we are using 1 node multigpus
 NODE_RANK=0
-WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
+WORLD_SIZE=2 # <--- remember to change this to the number of GPUs you actually have in your system
 
-CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'
-DATA_PATH='../dataset/EN/NVblogs_text_document'
-VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
-MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
-PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/naive/nsys_naive' # modify this to your own profile path
+CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it 
+DATA_PATH='../dataset/EN/NVblog_text_document' ## modify this path if you customize it 
+VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json' ## modify this path if you customize it 
+MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt' ## modify this path if you customize it 
+PROFILE_OUTPUT_PATH='../profiles/naive/nsys_naive' # modify this to your own profile path
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 

+ 0 - 76
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/resume_iter1M_SVpretrainGPT3_2.7B.sh

@@ -1,76 +0,0 @@
-#!/bin/bash 
-####### not working need tweaking
-EXP_NAME="SwedishGPT3_2.7B_OriginalMegatron"
- # ngc args
-INSTANCE="dgx1v.32g.8.norm"
-IMAGE="nvcr.io/nvidia/pytorch:20.11-py3"
-# wandb args
-PROJECT_NAME=SwedishGPT3_2.7B_OriginalMegatron
-# megatron-lm args
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$((${GPUS_PER_NODE}*${NNODES}))
-DATA_PATH=/raid/SV_CC100Sprakbank_text_document
-CHECKPOINT_PATH=/result
-VOCAB_FILE=/mnt/dataset/32k/vocab.json
-MERGE_FILE=/mnt/dataset/32k/merges.txt
-
-MP_SIZE=8
-DISTRIBUTED_ARGS="--nproc_per_node ${GPUS_PER_NODE} --nnodes ${NNODES} --node_rank ${NODE_RANK} --master_addr ${MASTER_ADDR} --master_port ${MASTER_PORT}"
-GPT_ARGS="--num-layers 32 \
-           --hidden-size 2560 \
-           --num-attention-heads 32 \
-           --seq-length 512 \
-           --max-position-embeddings 1024 \
-           --lr 0.00015 \
-           --train-iters 5000000 \
-           --min-lr 0.00001 \
-           --lr-decay-iters 990000 \
-           --lr-warmup-fraction 0.01 \
-           --override-lr-scheduler \
-           --micro-batch-size 2 \
-           --vocab-file ${VOCAB_FILE} \
-           --merge-file ${MERGE_FILE} \
-           --split 949,50,1 \
-           --distributed-backend nccl \
-           --fp16"
-
-OUTPUT_ARGS="--log-interval 10000 \
-             --save-interval 500000 \
-             --eval-interval 500000 \
-             --eval-iters 100000 \
-             --checkpoint-activations"
-CMD="python -m torch.distributed.launch ${DISTRIBUTED_ARGS} \
-    pretrain_gpt.py \
-        --tensor-model-parallel-size 2 \
-        --pipeline-model-parallel-size 2 \
-        ${GPT_ARGS} \
-        ${OUTPUT_ARGS} \
-        --save ${CHECKPOINT_PATH} \
-        --load ${CHECKPOINT_PATH} \
-        --data-path ${DATA_PATH}
-        --tensorboard-dir ${CHECKPOINT_PATH} "
-echo "${CMD}"
-ngc batch run \
---name ${EXP_NAME} --preempt RUNONCE --ace nv-us-west-2 \
---instance ${INSTANCE} \
---commandline "nvidia-smi && \
-cp -r /mnt/dataset/32k /raid && \
-cp /mnt/dataset/SV_CC100Sprakbank_text_document.bin /raid/ && \
-cp /mnt/dataset/SV_CC100Sprakbank_text_document.idx /raid/ && \
-cp -r /mnt/ckpt/iter_1000000 /result && \
-cp /mnt/ckpt/latest_checkpointed_iteration.txt /result && \
-ls /raid && \
-git clone https://github.com/NVIDIA/Megatron-LM.git && \
-cd Megatron-LM/ && \
-${CMD}" \
---result /result \
---image ${IMAGE} \
---org nvidian \
---datasetid 80889:/mnt/dataset \
---datasetid 84035:/mnt/ckpt \
---port 6006

+ 1 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/sv_utils/trainGPTTokenizer.py

@@ -1,3 +1,4 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
 # -*- coding: utf-8 -*-
 import os , sys
 from tokenizers import Tokenizer

The file diff is not shown because it is too large
+ 0 - 670
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-(option)Website_scrapping.ipynb


+ 31 - 40
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-1_acquiring_data.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "vital-advertising",
+   "id": "ahead-surrey",
    "metadata": {},
    "source": [
     "# \n",
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "permanent-reception",
+   "id": "exterior-avatar",
    "metadata": {},
    "source": [
     "--------------------------------------------------------------------------------------------------------------------\n",
@@ -33,28 +33,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "steady-henry",
+   "execution_count": 1,
+   "id": "yellow-happening",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2021-08-27 02:36:06--  http://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2\n",
+      "--2021-09-15 10:33:55--  http://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2\n",
       "Resolving spraakbanken.gu.se (spraakbanken.gu.se)... 130.241.42.13\n",
       "Connecting to spraakbanken.gu.se (spraakbanken.gu.se)|130.241.42.13|:80... connected.\n",
       "HTTP request sent, awaiting response... 301 Moved Permanently\n",
       "Location: https://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2 [following]\n",
-      "--2021-08-27 02:36:06--  https://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2\n",
+      "--2021-09-15 10:33:55--  https://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2\n",
       "Connecting to spraakbanken.gu.se (spraakbanken.gu.se)|130.241.42.13|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 464382665 (443M) [application/x-bzip2]\n",
       "Saving to: ‘webbnyheter2013.xml.bz2’\n",
       "\n",
-      "webbnyheter2013.xml 100%[===================>] 442.87M  16.9MB/s    in 28s     \n",
+      "webbnyheter2013.xml 100%[===================>] 442.87M   110MB/s    in 4.1s    \n",
       "\n",
-      "2021-08-27 02:36:35 (16.0 MB/s) - ‘webbnyheter2013.xml.bz2’ saved [464382665/464382665]\n",
+      "2021-09-15 10:33:59 (109 MB/s) - ‘webbnyheter2013.xml.bz2’ saved [464382665/464382665]\n",
       "\n"
      ]
     }
@@ -65,26 +65,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "id": "exposed-mouth",
+   "execution_count": 2,
+   "id": "happy-spectrum",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bunzip2: Input file ../../../../dataset/SV/ is a directory.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!bunzip2 -d webbnyheter2013.xml.bz2 "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
-   "id": "turned-navigator",
+   "execution_count": 3,
+   "id": "modular-helmet",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -93,15 +85,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "id": "level-discipline",
+   "execution_count": 4,
+   "id": "roman-strap",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "56k  webbnyheter2013.xml\n"
+      "32k  56k  webbnyheter2013.xml\n"
      ]
     }
    ],
@@ -111,24 +103,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
-   "id": "separated-payday",
+   "execution_count": 5,
+   "id": "moderate-newfoundland",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2021-08-27 03:35:01--  https://raw.githubusercontent.com/spraakbanken/sb-nltk-tools/master/sb_corpus_reader.py\n",
-      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
-      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
+      "--2021-09-15 10:38:48--  https://raw.githubusercontent.com/spraakbanken/sb-nltk-tools/master/sb_corpus_reader.py\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 3065 (3.0K) [text/plain]\n",
       "Saving to: ‘sb_corpus_reader.py’\n",
       "\n",
-      "sb_corpus_reader.py 100%[===================>]   2.99K  --.-KB/s    in 0s      \n",
+      "sb_corpus_reader.py 100%[===================>]   2.99K  --.-KB/s    in 0.001s  \n",
       "\n",
-      "2021-08-27 03:35:01 (45.7 MB/s) - ‘sb_corpus_reader.py’ saved [3065/3065]\n",
+      "2021-09-15 10:38:49 (3.77 MB/s) - ‘sb_corpus_reader.py’ saved [3065/3065]\n",
       "\n"
      ]
     }
@@ -139,15 +131,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "guilty-comparative",
+   "execution_count": 6,
+   "id": "annoying-topic",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "fname:  .txt\n",
       "[['Telekombranschen', 'lyfts', 'av', 'en', 'större', 'europeisk', 'telekomaffär', ',', 'nederländska', 'KPN', 'säljer', 'tysk', 'verksamhet', 'för', 'omkring', 'åtta', 'miljarder', 'euro', ',', 'och', 'en', 'stark', 'rapport', 'från', 'Telenor', '.'], ['Denna', 'upprepade', 'process', 'är', 'död', 'nu', '\"', ',', 'skriver', '\"', 'Shield', '\"', '-', 'skaparen', 'Shawn', 'Ryan', ',', 'som', 'låg', 'bakom', 'idén', ',', 'på', 'Twitter', '.']]\n",
       "write to :  webnyheter2013.txt\n",
       "finish processing  webnyheter2013.txt\n",
@@ -172,7 +163,7 @@
     "    print(\"finish processing \",fname)\n",
     "    f.close()\n",
     "    \n",
-    "out_path='./dataset/SV/'\n",
+    "out_path='../../../../dataset/SV/'\n",
     "xml_f=out_path+'webbnyheter2013.xml'\n",
     "if xml_f.endswith('.xml') :    \n",
     "    corpus = SBCorpusReader(xml_f)\n",
@@ -189,15 +180,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
-   "id": "rubber-finnish",
+   "execution_count": 7,
+   "id": "exterior-episode",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "56k  webbnyheter2013.xml  webnyheter2013.txt\n"
+      "32k  56k  webbnyheter2013.xml  webnyheter2013.txt\n"
      ]
     }
    ],
@@ -207,7 +198,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "round-somewhere",
+   "id": "impaired-sierra",
    "metadata": {},
    "source": [
     "---\n",
@@ -221,7 +212,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "celtic-appreciation",
+   "id": "junior-washington",
    "metadata": {},
    "source": [
     "-----\n",

+ 312 - 150
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-2_SentenceBoundary_and_Deduplicate.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "placed-musician",
+   "id": "cognitive-explanation",
    "metadata": {},
    "source": [
     "# \n",
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "greenhouse-living",
+   "id": "injured-appraisal",
    "metadata": {},
    "source": [
     "--------------------------------------------------------------------------------------------------------------------\n",
@@ -45,9 +45,9 @@
     "            call out a terminal             \n",
     "   ![call out a terminal ](../../pics/Alt_callout2terminals.JPG)\n",
     "   \n",
-    "            cd ./jupyter_notebook/Megatron-LM/tools/openwebtext/\n",
+    "            cd gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/\n",
     "        \n",
-    "            git clone https://github.com/mattilyra/LSH\n",
+    "            git clone https://github.com/mattilyra/LSH.git\n",
     "            cd LSH\n",
     "            pip install -U --user cython>=0.24.1\n",
     "            open setup.py in an editor and modify as below\n",
@@ -58,8 +58,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "broadband-staff",
+   "execution_count": 1,
+   "id": "elect-chair",
    "metadata": {},
    "outputs": [
     {
@@ -67,9 +67,13 @@
      "output_type": "stream",
      "text": [
       "Defaulting to user installation because normal site-packages is not writeable\n",
-      "Requirement already satisfied: ftfy in /home/zcharpy/.local/lib/python3.8/site-packages (6.0.3)\n",
-      "Requirement already satisfied: langdetect in /home/zcharpy/.local/lib/python3.8/site-packages (1.0.9)\n",
-      "Requirement already satisfied: numpy in /opt/conda/lib/python3.8/site-packages (1.19.2)\n",
+      "Collecting ftfy\n",
+      "  Downloading ftfy-6.0.3.tar.gz (64 kB)\n",
+      "\u001b[K     |████████████████████████████████| 64 kB 3.1 MB/s  eta 0:00:01\n",
+      "\u001b[?25hCollecting langdetect\n",
+      "  Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
+      "\u001b[K     |████████████████████████████████| 981 kB 30.5 MB/s eta 0:00:01\n",
+      "\u001b[?25hRequirement already satisfied: numpy in /opt/conda/lib/python3.8/site-packages (1.19.2)\n",
       "Requirement already satisfied: torch in /opt/conda/lib/python3.8/site-packages (1.9.0a0+df837d0)\n",
       "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (1.1.4)\n",
       "Requirement already satisfied: nltk in /opt/conda/lib/python3.8/site-packages (3.5)\n",
@@ -77,29 +81,54 @@
       "Requirement already satisfied: boto3 in /opt/conda/lib/python3.8/site-packages (1.17.32)\n",
       "Requirement already satisfied: tqdm in /opt/conda/lib/python3.8/site-packages (4.53.0)\n",
       "Requirement already satisfied: regex in /opt/conda/lib/python3.8/site-packages (2021.3.17)\n",
-      "Requirement already satisfied: bs4 in /home/zcharpy/.local/lib/python3.8/site-packages (0.0.1)\n",
-      "Requirement already satisfied: htmlmin in /home/zcharpy/.local/lib/python3.8/site-packages (0.1.12)\n",
-      "Requirement already satisfied: tldextract in /home/zcharpy/.local/lib/python3.8/site-packages (3.1.0)\n",
-      "Requirement already satisfied: sentence-splitter in /home/zcharpy/.local/lib/python3.8/site-packages (1.4)\n",
-      "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.8/site-packages (from ftfy) (0.2.5)\n",
+      "Requirement already satisfied: bs4 in /home/x_zench/.local/lib/python3.8/site-packages (0.0.1)\n",
+      "Collecting htmlmin\n",
+      "  Downloading htmlmin-0.1.12.tar.gz (19 kB)\n",
+      "Collecting tldextract\n",
+      "  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)\n",
+      "\u001b[K     |████████████████████████████████| 87 kB 10.3 MB/s  eta 0:00:01\n",
+      "\u001b[?25hCollecting sentence-splitter\n",
+      "  Downloading sentence_splitter-1.4-py2.py3-none-any.whl (44 kB)\n",
+      "\u001b[K     |████████████████████████████████| 44 kB 3.2 MB/s s eta 0:00:01\n",
+      "\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.8/site-packages (from ftfy) (0.2.5)\n",
       "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from langdetect) (1.15.0)\n",
       "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.8/site-packages (from torch) (3.7.4.3)\n",
       "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.8/site-packages (from pandas) (2021.1)\n",
       "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.8/site-packages (from pandas) (2.8.1)\n",
       "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from nltk) (1.0.1)\n",
       "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from nltk) (7.1.2)\n",
-      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.8/site-packages (from boto3) (0.3.6)\n",
       "Requirement already satisfied: botocore<1.21.0,>=1.20.32 in /opt/conda/lib/python3.8/site-packages (from boto3) (1.20.32)\n",
+      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.8/site-packages (from boto3) (0.3.6)\n",
       "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.8/site-packages (from boto3) (0.10.0)\n",
       "Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.8/site-packages (from bs4) (4.9.3)\n",
       "Requirement already satisfied: filelock>=3.0.8 in /opt/conda/lib/python3.8/site-packages (from tldextract) (3.0.12)\n",
       "Requirement already satisfied: requests>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from tldextract) (2.24.0)\n",
       "Requirement already satisfied: idna in /opt/conda/lib/python3.8/site-packages (from tldextract) (2.10)\n",
-      "Requirement already satisfied: requests-file>=1.4 in /home/zcharpy/.local/lib/python3.8/site-packages (from tldextract) (1.5.1)\n",
+      "Collecting requests-file>=1.4\n",
+      "  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)\n",
       "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /opt/conda/lib/python3.8/site-packages (from botocore<1.21.0,>=1.20.32->boto3) (1.25.11)\n",
       "Requirement already satisfied: soupsieve>1.2; python_version >= \"3.0\" in /opt/conda/lib/python3.8/site-packages (from beautifulsoup4->bs4) (2.2)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.1.0->tldextract) (2020.12.5)\n",
       "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.1.0->tldextract) (3.0.4)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.1.0->tldextract) (2020.12.5)\n"
+      "Building wheels for collected packages: ftfy, langdetect, htmlmin\n",
+      "  Building wheel for ftfy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41914 sha256=c53e0371cfd741f088eee74f8f73b93014e7846ecdabc0b0f52157d3b014124b\n",
+      "  Stored in directory: /home/x_zench/.cache/pip/wheels/7f/40/63/4bf603cec3ecc4a26985405834cb47eb8368bfa59e15dde046\n",
+      "  Building wheel for langdetect (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=755dd5f12ab2219ddb785b5ff2215d15cb75e189b9ff00fa01506c4caab14d18\n",
+      "  Stored in directory: /home/x_zench/.cache/pip/wheels/13/c7/b0/79f66658626032e78fc1a83103690ef6797d551cb22e56e734\n",
+      "  Building wheel for htmlmin (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for htmlmin: filename=htmlmin-0.1.12-py3-none-any.whl size=27084 sha256=bf688cdafbb2a552e92b829fa0fc877d8739d7b32740491ea9ea0c04afdc6f61\n",
+      "  Stored in directory: /home/x_zench/.cache/pip/wheels/23/14/6e/4be5bfeeb027f4939a01764b48edd5996acf574b0913fe5243\n",
+      "Successfully built ftfy langdetect htmlmin\n",
+      "Installing collected packages: ftfy, langdetect, htmlmin, requests-file, tldextract, sentence-splitter\n",
+      "\u001b[33m  WARNING: The script ftfy is installed in '/home/x_zench/.local/bin' which is not on PATH.\n",
+      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
+      "\u001b[33m  WARNING: The script htmlmin is installed in '/home/x_zench/.local/bin' which is not on PATH.\n",
+      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
+      "\u001b[33m  WARNING: The script tldextract is installed in '/home/x_zench/.local/bin' which is not on PATH.\n",
+      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
+      "Successfully installed ftfy-6.0.3 htmlmin-0.1.12 langdetect-1.0.9 requests-file-1.5.1 sentence-splitter-1.4 tldextract-3.1.2\n"
      ]
     }
    ],
@@ -109,7 +138,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cordless-square",
+   "id": "excessive-madison",
    "metadata": {},
    "source": [
     "-------------------------------------------------------------------------------\n",
@@ -118,8 +147,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "selected-panel",
+   "execution_count": 2,
+   "id": "choice-nicholas",
    "metadata": {},
    "outputs": [
     {
@@ -128,7 +157,7 @@
        "'sv'"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -141,8 +170,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "binding-arthur",
+   "execution_count": 3,
+   "id": "nonprofit-statistics",
    "metadata": {},
    "outputs": [
     {
@@ -151,7 +180,7 @@
        "'da'"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -163,8 +192,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "whole-advisory",
+   "execution_count": 4,
+   "id": "adverse-robertson",
    "metadata": {},
    "outputs": [
     {
@@ -173,7 +202,7 @@
        "'fi'"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -185,7 +214,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "hourly-capital",
+   "id": "interpreted-links",
    "metadata": {},
    "source": [
     "-----------------------------------------------------------\n",
@@ -194,16 +223,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "running-incentive",
+   "execution_count": 5,
+   "id": "typical-accused",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "[nltk_data] Downloading package punkt to /home/zcharpy/nltk_data...\n",
-      "[nltk_data]   Package punkt is already up-to-date!\n"
+      "[nltk_data] Downloading package punkt to /home/x_zench/nltk_data...\n",
+      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
      ]
     },
     {
@@ -212,7 +241,7 @@
        "True"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -224,8 +253,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "thermal-element",
+   "execution_count": 6,
+   "id": "ranking-semester",
    "metadata": {},
    "outputs": [
     {
@@ -254,7 +283,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "verbal-consortium",
+   "id": "portable-bumper",
    "metadata": {},
    "source": [
     "-----------------------------------------------------------\n",
@@ -263,8 +292,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "afraid-armor",
+   "execution_count": 8,
+   "id": "environmental-rating",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2021-09-15 11:26:03--  https://github.com/mediacloud/sentence-splitter/blob/develop/sentence_splitter/non_breaking_prefixes/sv.txt\n",
+      "Resolving github.com (github.com)... 140.82.121.4\n",
+      "Connecting to github.com (github.com)|140.82.121.4|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: unspecified [text/html]\n",
+      "Saving to: ‘sv.txt’\n",
+      "\n",
+      "sv.txt                  [ <=>                ] 191.69K  --.-KB/s    in 0.08s   \n",
+      "\n",
+      "2021-09-15 11:26:04 (2.23 MB/s) - ‘sv.txt’ saved [196294]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget https://github.com/mediacloud/sentence-splitter/blob/develop/sentence_splitter/non_breaking_prefixes/sv.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "collective-medication",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mv sv.txt custom_english_non_breaking_prefixes.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "secure-encounter",
    "metadata": {},
    "outputs": [
     {
@@ -289,7 +356,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "activated-blair",
+   "id": "varying-province",
    "metadata": {},
    "source": [
     "-----------------------------------------------------------\n",
@@ -298,8 +365,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "noble-elements",
+   "execution_count": 11,
+   "id": "voluntary-madness",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -334,17 +401,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "personal-knowing",
+   "execution_count": 12,
+   "id": "raising-salad",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "------- sentence 3 -------\n",
+      "------- sentence 1 -------\n",
       "Andersson pekas ut som nästa partiledare:\n",
-      "------- sentence 4 -------\n",
+      "------- sentence 2 -------\n",
       "“Medlemmarna ska säga sitt”\n"
      ]
     }
@@ -360,7 +427,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "monthly-impossible",
+   "id": "facial-trading",
    "metadata": {},
    "source": [
     "-----------------------------------------------------------\n",
@@ -370,7 +437,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "modified-carpet",
+   "id": "agricultural-onion",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -418,8 +485,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "civil-range",
+   "execution_count": 2,
+   "id": "nonprofit-panama",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -432,19 +499,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "electoral-mining",
+   "execution_count": 3,
+   "id": "empty-while",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.6\n",
-      "0.62\n",
       "0.61\n",
-      "0.66\n",
-      "0.63\n"
+      "0.72\n",
+      "0.6\n",
+      "0.6\n",
+      "0.66\n"
      ]
     }
    ],
@@ -459,7 +526,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "guided-hardware",
+   "id": "blind-union",
    "metadata": {},
    "source": [
     "## dataset extracted from NVIDIA blog urls "
@@ -467,8 +534,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "suitable-director",
+   "execution_count": 4,
+   "id": "instant-grade",
    "metadata": {},
    "outputs": [
     {
@@ -498,23 +565,23 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>Today, NVIDIA announced new pretrained models ...</td>\n",
+       "      <td>Deep learning models have been successfully us...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>This post was updated July 20, 2021 to reflect...</td>\n",
+       "      <td>Breast cancer is the most frequently diagnosed...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>In part 1 of this series, we introduced new AP...</td>\n",
+       "      <td>The NVIDIA Deep Learning Institute (DLI) exten...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>The NVIDIA NGC team is hosting a webinar with ...</td>\n",
+       "      <td>Engineers, product developers and designers ar...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>NVIDIA announces our newest release of the CUD...</td>\n",
+       "      <td>Despite substantial progress in natural langua...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -522,14 +589,14 @@
       ],
       "text/plain": [
        "                                                doc1\n",
-       "0  Today, NVIDIA announced new pretrained models ...\n",
-       "1  This post was updated July 20, 2021 to reflect...\n",
-       "2  In part 1 of this series, we introduced new AP...\n",
-       "3  The NVIDIA NGC team is hosting a webinar with ...\n",
-       "4  NVIDIA announces our newest release of the CUD..."
+       "0  Deep learning models have been successfully us...\n",
+       "1  Breast cancer is the most frequently diagnosed...\n",
+       "2  The NVIDIA Deep Learning Institute (DLI) exten...\n",
+       "3  Engineers, product developers and designers ar...\n",
+       "4  Despite substantial progress in natural langua..."
       ]
      },
-     "execution_count": 6,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -543,7 +610,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "massive-tenant",
+   "id": "attached-candle",
    "metadata": {},
    "source": [
     "## create our own groudtruth dataset"
@@ -551,8 +618,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "bigger-elder",
+   "execution_count": 5,
+   "id": "constant-mouth",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -582,8 +649,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
-   "id": "automatic-sheffield",
+   "execution_count": 6,
+   "id": "accepting-truck",
    "metadata": {},
    "outputs": [
     {
@@ -615,38 +682,38 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>68</th>\n",
-       "      <td>68</td>\n",
-       "      <td>Despite substantial progress in natural langua...</td>\n",
-       "      <td>Have a story to share? Submit an idea.Get the ...</td>\n",
-       "      <td>False</td>\n",
+       "      <th>65</th>\n",
+       "      <td>65</td>\n",
+       "      <td>This post was updated July 20, 2021 to reflect...</td>\n",
+       "      <td>This post was updated July 20, 2021 to reflect...</td>\n",
+       "      <td>True</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>69</th>\n",
-       "      <td>69</td>\n",
-       "      <td>Engineers, product developers and designers ar...</td>\n",
-       "      <td>The NGC team is hosting a webinar and live Q&amp;A...</td>\n",
+       "      <th>66</th>\n",
+       "      <td>66</td>\n",
+       "      <td>Researchers, developers, and engineers worldwi...</td>\n",
+       "      <td>This post was originally published in August 2...</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>70</th>\n",
-       "      <td>70</td>\n",
-       "      <td>NVIDIA NeMo is a conversational AI toolkit bui...</td>\n",
-       "      <td>NVIDIA NeMo is a conversational AI toolkit bui...</td>\n",
-       "      <td>True</td>\n",
+       "      <th>67</th>\n",
+       "      <td>67</td>\n",
+       "      <td>Looking to reveal secrets of days past, histor...</td>\n",
+       "      <td>The NVIDIA Deep Learning Institute (DLI) exten...</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>71</th>\n",
-       "      <td>71</td>\n",
-       "      <td>In NVIDIA Clara Train 4.0, we added homomorphi...</td>\n",
-       "      <td>In NVIDIA Clara Train 4.0, we added homomorphi...</td>\n",
-       "      <td>True</td>\n",
+       "      <th>68</th>\n",
+       "      <td>68</td>\n",
+       "      <td>Scientists searching the universe for gravitat...</td>\n",
+       "      <td>Robotics researchers from NVIDIA and Universit...</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>72</th>\n",
-       "      <td>72</td>\n",
-       "      <td>Targeting areas populated with disease-carryin...</td>\n",
-       "      <td>The NVIDIA NGC team is hosting a webinar with ...</td>\n",
+       "      <th>69</th>\n",
+       "      <td>69</td>\n",
+       "      <td>At GTC ’21, experts presented a variety of tec...</td>\n",
+       "      <td>The NVIDIA Hardware Grant Program helps advanc...</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -655,21 +722,21 @@
       ],
       "text/plain": [
        "    index                                               doc1  \\\n",
-       "68     68  Despite substantial progress in natural langua...   \n",
-       "69     69  Engineers, product developers and designers ar...   \n",
-       "70     70  NVIDIA NeMo is a conversational AI toolkit bui...   \n",
-       "71     71  In NVIDIA Clara Train 4.0, we added homomorphi...   \n",
-       "72     72  Targeting areas populated with disease-carryin...   \n",
+       "65     65  This post was updated July 20, 2021 to reflect...   \n",
+       "66     66  Researchers, developers, and engineers worldwi...   \n",
+       "67     67  Looking to reveal secrets of days past, histor...   \n",
+       "68     68  Scientists searching the universe for gravitat...   \n",
+       "69     69  At GTC ’21, experts presented a variety of tec...   \n",
        "\n",
        "                                                 doc2  duplicate  \n",
-       "68  Have a story to share? Submit an idea.Get the ...      False  \n",
-       "69  The NGC team is hosting a webinar and live Q&A...      False  \n",
-       "70  NVIDIA NeMo is a conversational AI toolkit bui...       True  \n",
-       "71  In NVIDIA Clara Train 4.0, we added homomorphi...       True  \n",
-       "72  The NVIDIA NGC team is hosting a webinar with ...      False  "
+       "65  This post was updated July 20, 2021 to reflect...       True  \n",
+       "66  This post was originally published in August 2...      False  \n",
+       "67  The NVIDIA Deep Learning Institute (DLI) exten...      False  \n",
+       "68  Robotics researchers from NVIDIA and Universit...      False  \n",
+       "69  The NVIDIA Hardware Grant Program helps advanc...      False  "
       ]
      },
-     "execution_count": 67,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -681,19 +748,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "id": "french-solution",
+   "execution_count": 7,
+   "id": "acting-tiffany",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "False    42\n",
-       "True     31\n",
+       "False    45\n",
+       "True     25\n",
        "Name: duplicate, dtype: int64"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -705,8 +772,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
-   "id": "tight-complexity",
+   "execution_count": 8,
+   "id": "wicked-youth",
    "metadata": {},
    "outputs": [
     {
@@ -715,7 +782,7 @@
        "Index(['index', 'doc1', 'doc2', 'duplicate'], dtype='object')"
       ]
      },
-     "execution_count": 76,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -726,10 +793,92 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "dried-section",
+   "execution_count": 9,
+   "id": "rural-lotus",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>doc1</th>\n",
+       "      <th>doc2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Deep learning models have been successfully us...</td>\n",
+       "      <td>Deep learning models have been successfully us...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Breast cancer is the most frequently diagnosed...</td>\n",
+       "      <td>In NVIDIA Clara Train 4.0, we added homomorphi...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>The NVIDIA Deep Learning Institute (DLI) exten...</td>\n",
+       "      <td>The NVIDIA Deep Learning Institute (DLI) exten...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Engineers, product developers and designers ar...</td>\n",
+       "      <td>Deep learning research requires working at sca...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Despite substantial progress in natural langua...</td>\n",
+       "      <td>NVIDIA announces our newest release of the CUD...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   index                                               doc1  \\\n",
+       "0      0  Deep learning models have been successfully us...   \n",
+       "1      1  Breast cancer is the most frequently diagnosed...   \n",
+       "2      2  The NVIDIA Deep Learning Institute (DLI) exten...   \n",
+       "3      3  Engineers, product developers and designers ar...   \n",
+       "4      4  Despite substantial progress in natural langua...   \n",
+       "\n",
+       "                                                doc2  \n",
+       "0  Deep learning models have been successfully us...  \n",
+       "1  In NVIDIA Clara Train 4.0, we added homomorphi...  \n",
+       "2  The NVIDIA Deep Learning Institute (DLI) exten...  \n",
+       "3  Deep learning research requires working at sca...  \n",
+       "4  NVIDIA announces our newest release of the CUD...  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "del df\n",
     "keep_cols_to_write=['index','doc1','doc2']\n",
@@ -739,7 +888,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "italic-statement",
+   "id": "dominican-trick",
    "metadata": {},
    "source": [
     "---\n",
@@ -748,8 +897,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "motivated-saudi",
+   "execution_count": 10,
+   "id": "married-straight",
    "metadata": {},
    "outputs": [
     {
@@ -835,7 +984,7 @@
        "4  As an undergraduate student excited about AI f...      False  "
       ]
      },
-     "execution_count": 8,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -848,8 +997,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "spare-springer",
+   "execution_count": 11,
+   "id": "rocky-courage",
    "metadata": {},
    "outputs": [
     {
@@ -860,7 +1009,7 @@
        "Name: duplicate, dtype: int64"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -873,7 +1022,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "premium-debate",
+   "id": "boring-piece",
    "metadata": {},
    "outputs": [
     {
@@ -890,7 +1039,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "looking-funds",
+   "id": "fresh-norfolk",
    "metadata": {},
    "source": [
     "---\n",
@@ -900,24 +1049,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
-   "id": "potential-functionality",
+   "execution_count": 13,
+   "id": "starting-arabic",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "pair of similar sentences with jaccard_sim score:0.9685534591194969 and minhash_sim score:0.9801980198019802 --- \n",
-      "\n",
-      "text_a: ['Have', 'a', 'story', 'to', 'share?']\n",
-      "text_b: ['\\xa0Read', 'more', '>>>Read', 'the', 'full']\n",
-      "--------------------------------------------------\n",
-      "pair of similar sentences with jaccard_sim score:0.8197797952482132 and minhash_sim score:0.7543859649122807 --- \n",
+      "pair of similar sentences with jaccard_sim score:0.8197797952482132 and minhash_sim score:0.639344262295082 --- \n",
       "\n",
       "text_a: ['The', 'NVIDIA,', 'Facebook,', 'and', 'TensorFlow']\n",
       "text_b: ['Deep', 'learning', '(DL)', 'is', 'the']\n",
       "--------------------------------------------------\n",
+      "pair of similar sentences with jaccard_sim score:0.9133693568066934 and minhash_sim score:0.8867924528301887 --- \n",
+      "\n",
+      "100% duplicates \n",
+      "\n",
+      "text_a: ['The', 'first', 'post', 'in', 'this']\n",
+      "text_b: ['The', 'first', 'post', 'in', 'this']\n",
+      "--------------------------------------------------\n",
       "\n",
       "There are **3** candidate duplicates in total\n",
       "\n"
@@ -926,10 +1077,10 @@
     {
      "data": {
       "text/plain": [
-       "[('17', '38', 0.8650075414781297, 0.6949152542372882)]"
+       "[('25', '51', 0.9685534591194969, 0.9607843137254902)]"
       ]
      },
-     "execution_count": 71,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
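
The output above pairs up candidate duplicates and reports both an exact `jaccard_sim` and an approximate `minhash_sim` for each pair. As a rough illustration of why the two numbers track each other, here is a self-contained sketch — not the notebook's own code, whose shingling and hashing choices may differ — that computes an exact Jaccard score over word 5-gram shingles and a MinHash estimate of the same quantity.

```python
# Self-contained sketch: exact Jaccard similarity vs. a MinHash estimate of it.
# The shingle size (5 words) and number of hash functions (128) are arbitrary choices.

def shingles(text, size=5):
    """Return the set of word n-gram shingles of a document."""
    words = text.split()
    return {" ".join(words[i:i + size]) for i in range(max(1, len(words) - size + 1))}

def jaccard(a, b):
    """Exact Jaccard similarity of two shingle sets."""
    return len(a & b) / len(a | b) if (a | b) else 0.0

def minhash_signature(shingle_set, num_perm=128):
    """One minimum hash value per 'permutation'; matching positions estimate Jaccard."""
    return [min(hash((seed, s)) for s in shingle_set) for seed in range(num_perm)]

def minhash_similarity(sig_a, sig_b):
    return sum(x == y for x, y in zip(sig_a, sig_b)) / len(sig_a)

doc_a = "The NVIDIA Deep Learning Institute extends hands-on training to developers"
doc_b = "The NVIDIA Deep Learning Institute extends hands-on training to data scientists"
set_a, set_b = shingles(doc_a), shingles(doc_b)
print("jaccard_sim:", jaccard(set_a, set_b))   # exact value: 0.625
print("minhash_sim:", minhash_similarity(minhash_signature(set_a),
                                         minhash_signature(set_b)))  # close to 0.625
```
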
@@ -994,7 +1145,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fewer-template",
+   "id": "banner-dispute",
    "metadata": {},
    "source": [
     "---\n",
@@ -1003,8 +1154,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "actual-holocaust",
+   "execution_count": 14,
+   "id": "spread-entity",
    "metadata": {},
    "outputs": [
     {
@@ -1015,7 +1166,7 @@
        "Name: duplicate, dtype: int64"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1026,8 +1177,19 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "exempt-juice",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# clean up \n",
+    "!rm custom_english_non_breaking_prefixes.txt"
+   ]
+  },
+  {
    "cell_type": "markdown",
-   "id": "contemporary-accreditation",
+   "id": "abandoned-valuation",
    "metadata": {},
    "source": [
     "<a id=\"TheChallenge\"></a>"
@@ -1035,14 +1197,14 @@
   },
   {
    "cell_type": "markdown",
-   "id": "moved-housing",
+   "id": "thick-external",
    "metadata": {},
    "source": [
     "---\n",
     "# Mini Challenge - approaching the groundtruth !\n",
     "\n",
     "Task : Aiming to approach the number 31 modifying the below parameters\n",
-    "rerun cell <a href=\"./Day3-1_SentenceBoundary_and_Deduplicate.ipynb#Rerun_Cell\">Jump to ReRun Cell</a>\n",
+    "rerun cell <a href=\"./Day3-2_SentenceBoundary_and_Deduplicate.ipynb#Rerun_Cell\">Jump to ReRun Cell</a>\n",
     "\n",
     "Consider yourself pass this mini challenge when you approach the number **31 +/- 3** ! \n",
     "\n",
@@ -1060,7 +1222,7 @@
   {
    "cell_type": "code",
    "execution_count": 42,
-   "id": "approved-stanford",
+   "id": "sophisticated-boating",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -1095,7 +1257,7 @@
   {
    "cell_type": "code",
    "execution_count": 114,
-   "id": "recent-father",
+   "id": "meaningful-sample",
    "metadata": {
     "jupyter": {
      "source_hidden": true
@@ -1116,7 +1278,7 @@
   {
    "cell_type": "code",
    "execution_count": 115,
-   "id": "aggressive-craps",
+   "id": "operational-steps",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -1152,7 +1314,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "human-palmer",
+   "id": "revolutionary-framing",
    "metadata": {},
    "source": [
     "---\n",
@@ -1166,7 +1328,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "permanent-effectiveness",
+   "id": "cutting-greeting",
    "metadata": {},
    "source": [
     "-----\n",

The file diff is not shown because of its large size
+ 221 - 102
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb


+ 1 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/fetchURLs_and_write2html.sh

@@ -1,3 +1,4 @@
 #!/bin/bash
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
 export PATH="${PATH}:${HOME}/.local/bin"
 input="NVdevblog_urls.txt"

+ 6 - 6
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/verify_GPT3_Svenska.sh

@@ -1,17 +1,17 @@
 # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
-GPUS_PER_NODE=4
+GPUS_PER_NODE=2
 # Change for multinode config
 MASTER_ADDR=localhost
 MASTER_PORT=6000
 NNODES=1
 NODE_RANK=0
-WORLD_SIZE=8
+WORLD_SIZE=2
 TENSOR_MP_SIZE=1
 PIPELINE_MP_SIZE=1
-CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'
-DATA_PATH='../dataset/EN/NVblogs_text_document'
-VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
-MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
+CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it 
+DATA_PATH='../dataset/EN/NVblog_text_document' ## modify this path if you customize it 
+VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json' ## modify this path if you customize it 
+MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt' ## modify this path if you customize it 
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 

+ 5 - 5
ai/Megatron/English/Python/jupyter_notebook/params_cnt.sh

@@ -1,9 +1,9 @@
 # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
-NLAYERS=32
-NHIDDEN=2048
-NHEADS=32
-SEQ_LEN=64
-VOCAB_SIZE=32000
+NLAYERS=32 ## modify this param
+NHIDDEN=4096 ## modify this param
+NHEADS=32 ## modify this param
+SEQ_LEN=512 ## modify this param
+VOCAB_SIZE=56000 ## modify this param
 
 MODEL_SIZE=$((($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ) / 10**9))
 EXACT_MODEL_SIZE=$(($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ))
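
For reference, with the values set above (NLAYERS=32, NHIDDEN=4096, SEQ_LEN=512, VOCAB_SIZE=56000) the formula works out to roughly 6.7 billion parameters; the script prints 6 for MODEL_SIZE because of the integer division by 10**9. The same arithmetic in Python:

```python
# The same arithmetic as params_cnt.sh, evaluated for the values set above.
NLAYERS, NHIDDEN, SEQ_LEN, VOCAB_SIZE = 32, 4096, 512, 56000

exact = NLAYERS * (12 * NHIDDEN**2 + 13 * NHIDDEN) + VOCAB_SIZE * NHIDDEN + SEQ_LEN * NHIDDEN
print(exact)           # 6675628032  (~6.7B parameters)
print(exact // 10**9)  # 6 -> the integer "billions" figure the script reports
```
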

+ 4 - 1
ai/Megatron/English/Python/source_code/create_dir_and_download_pytorch_sif_file.sh

@@ -46,8 +46,11 @@ mv toydata.zip ./dataset/
 cd dataset/
 unzip toydata.zip
 cd ..
-ls ./dataset/
 
 ### move the Megatron run script to the correct directory
 cp ./gpubootcamp/ai/Megatron/English/Python/source_code/Day1-runMegatron-LM_GPT_template.sh ./
+
+### clone Megatron-LM repo onto the current dir
+git clone https://github.com/NVIDIA/Megatron-LM.git
+ls .
 echo "done !"