
edit to train sampler instead of train iter

zenodia 3 years ago
parent
commit
f49a3d6c00
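
The commit title refers to Megatron-LM's two ways of bounding a training run: --train-iters (a fixed number of optimizer iterations) and --train-samples (a fixed number of training samples, from which the iteration count is derived). The launch blocks below keep --train-samples 100; with the new global batch size of 512, 100 samples is less than one full global batch, which is presumably why the log further down prints "setting training iterations to 0". A minimal sketch of the two alternatives (not part of this commit):

       --train-samples 100 \   # sample-based budget, as used in the notebook and profile_SVGPT_BIG.sh
    #  --train-iters 100 \     # iteration-based alternative; Megatron-LM expects exactly one of the two flags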

+ 69 - 50
ai/Megatron/English/Python/jupyter_notebook/Day3-5_run_Megatron_with_varying_config.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "external-dating",
+   "id": "postal-promotion",
    "metadata": {},
    "source": [
     "# \n",
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "signal-skiing",
+   "id": "deadly-windows",
    "metadata": {},
    "source": [
     "<a id=\"Rerun_Cell\"></a>"
@@ -59,7 +59,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "durable-closing",
+   "id": "constant-affairs",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,8 +68,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "id": "formal-steam",
+   "execution_count": 2,
+   "id": "transsexual-costume",
    "metadata": {},
    "outputs": [
     {
@@ -101,13 +101,13 @@
     "\n",
     "TENSOR_MP_SIZE=8\n",
     "PIPELINE_MP_SIZE=1\n",
-    "LAYERS=\n",
-    "HIDDEN_SZ=\n",
-    "NUM_ATTN_HEADS=\n",
-    "MICRO_BZ=\n",
-    "GLOBAL_BZ=\n",
-    "SEQ_LEN=\n",
-    "MAX_POS_EM=\n",
+    "LAYERS=32\n",
+    "HIDDEN_SZ=2048\n",
+    "NUM_ATTN_HEADS=32\n",
+    "MICRO_BZ=64\n",
+    "GLOBAL_BZ=512\n",
+    "SEQ_LEN=512\n",
+    "MAX_POS_EM=512\n",
     "#### -------------------- end of blocks ------------------------#### \n",
     "\n",
     "export OMP_NUM_THREADS=1\n",
@@ -129,7 +129,7 @@
     "       --train-samples 100 \\\n",
     "       --save $CHECKPOINT_PATH \\\n",
     "       --load $CHECKPOINT_PATH \\\n",
-    "       --data-path $DATA_PATH \\\n",
+    "       --data-path 1. $DATA_PATH \\\n",
     "       --vocab-file $VOCAB_FILE \\\n",
     "       --merge-file $MERGE_FILE \\\n",
     "       --data-impl mmap \\\n",
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "facial-teacher",
+   "id": "constant-fighter",
    "metadata": {},
    "source": [
     "---\n",
@@ -162,7 +162,7 @@
   {
    "cell_type": "code",
    "execution_count": 39,
-   "id": "eastern-prairie",
+   "id": "weekly-mechanics",
    "metadata": {},
    "outputs": [
     {
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "three-remainder",
+   "id": "concrete-finland",
    "metadata": {},
    "source": [
     "---\n",
@@ -204,8 +204,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "weekly-number",
+   "execution_count": 3,
+   "id": "experienced-standing",
    "metadata": {},
    "outputs": [
     {
@@ -221,15 +221,15 @@
       "Initializing NVTX monkey patches\n",
       "Initializing NVTX monkey patches\n",
       "Initializing NVTX monkey patches\n",
+      "Initializing NVTX monkey patches\n",
+      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Initializing NVTX monkey patchesDone with NVTX monkey patching\n",
-      "\n",
+      "Done with NVTX monkey patching\n",
+      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
       "Done with NVTX monkey patching\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
       "Done with NVTX monkey patching\n",
@@ -273,7 +273,7 @@
       "  consumed_valid_samples .......................... 0\n",
       "  data_impl ....................................... mmap\n",
       "  data_parallel_size .............................. 1\n",
-      "  data_path ....................................... ['../dataset/EN/NVblogs_text_document']\n",
+      "  data_path ....................................... ['1.', '../dataset/EN/NVblogs_text_document']\n",
       "  dataloader_type ................................. single\n",
       "  DDP_impl ........................................ local\n",
       "  decoder_seq_length .............................. None\n",
@@ -399,7 +399,7 @@
       "make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
       "make: Nothing to be done for 'default'.\n",
       "make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
-      ">>> done with dataset index builder. Compilation time: 0.644 seconds\n",
+      ">>> done with dataset index builder. Compilation time: 0.622 seconds\n",
       "> compiling and loading fused kernels ...\n",
       "Detected CUDA files, patching ldflags\n",
       "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
@@ -419,24 +419,24 @@
       "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
       "ninja: no work to do.\n",
       "Loading extension module fused_mix_prec_layer_norm_cuda...\n",
-      ">>> done with compiling and loading fused kernels. Compilation time: 29.070 seconds\n",
-      "time to initialize megatron (seconds): 21.626\n",
-      "[after megatron is initialized] datetime: 2021-08-27 06:24:37 \n",
+      ">>> done with compiling and loading fused kernels. Compilation time: 31.948 seconds\n",
+      "time to initialize megatron (seconds): 9.410\n",
+      "[after megatron is initialized] datetime: 2021-08-30 05:52:25 \n",
       "building GPT model ...\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024 > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+      "\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 215937024\n",
       "setting training iterations to 0\n",
       "> learning rate decay style: cosine\n",
       "WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
       "    will not load any checkpoints and will start from random\n",
-      "time (ms) | load-checkpoint: 28.95\n",
-      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-27 06:24:41 \n",
+      "time (ms) | load-checkpoint: 54.82\n",
+      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 05:52:29 \n",
       "> building train, validation, and test datasets ...\n",
       " > datasets target sizes (minimum size):\n",
       "    train:      100\n",
@@ -449,7 +449,7 @@
       "    reading document index...\n",
       "    creating numpy buffer of mmap...\n",
       "    creating memory view of numpy buffer...\n",
-      " > finished creating indexed dataset in 0.003079 seconds\n",
+      " > finished creating indexed dataset in 0.005710 seconds\n",
       "    number of documents: 74\n",
       " > dataset split:\n",
       "    train:\n",
@@ -458,34 +458,53 @@
       "     document indices in [70, 74) total of 4 documents\n",
       "    test:\n",
       "     document indices in [74, 74) total of 0 documents\n",
-      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_shuffle_idx.npy\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > only one epoch required, setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.002701\n",
+      "    using:\n",
+      "     number of documents:       70\n",
+      "     number of epochs:          1\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   141\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.003840\n",
+      " > building shuffle index with split [0, 141) and [141, 141) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.001237\n",
+      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
       "    loaded indexed file in 0.003 seconds\n",
       "    total number of samples: 142\n",
       "    total number of epochs: 1\n",
-      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_shuffle_idx.npy\n",
+      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
       "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5122\n",
-      "    total number of epochs: 660\n",
+      "    total number of samples: 5153\n",
+      "    total number of epochs: 664\n",
+      "> building indices for blendable datasets ...\n",
+      " > sample ratios:\n",
+      "   dataset 0, input: 1, achieved: 1\n",
+      "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
+      "> building indices for blendable datasets ...\n",
+      " > sample ratios:\n",
+      "   dataset 0, input: 1, achieved: 1\n",
+      "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
       "> finished creating GPT datasets ...\n",
-      "[after dataloaders are built] datetime: 2021-08-27 06:24:46 \n",
+      "[after dataloaders are built] datetime: 2021-08-30 05:52:33 \n",
       "done with setup ...\n",
       "training ...\n",
-      "time (ms) | model-and-optimizer-setup: 4013.85 | train/valid/test-data-iterators-setup: 2773.74\n",
-      "[after training is done] datetime: 2021-08-27 06:24:46 \n",
+      "time (ms) | model-and-optimizer-setup: 3872.16 | train/valid/test-data-iterators-setup: 2957.85\n",
+      "[after training is done] datetime: 2021-08-30 05:52:34 \n",
       "------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for val data | lm loss value: 1.124495E+01 | lm loss PPL: 7.649290E+04 | \n",
+      " validation loss at the end of training for val data | lm loss value: 1.124461E+01 | lm loss PPL: 7.646642E+04 | \n",
       "------------------------------------------------------------------------------------------------------------------\n",
       "Processing events...\n",
       "Capturing symbol files...\n",
-      "Saving temporary \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdstrm\" file to disk...\n",
+      "Saving temporary \"/tmp/nsys-report-c19d-0647-d480-f24c.qdstrm\" file to disk...\n",
       "Creating final output files...\n",
       "\n",
       "Processing [==============================================================100%]\n",
-      "Saved report file to \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdrep\"\n",
+      "Saved report file to \"/tmp/nsys-report-c19d-0647-d480-f24c.qdrep\"\n",
       "Report file moved to \"/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved.qdrep\"\n"
      ]
     }
@@ -496,7 +515,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "parallel-language",
+   "id": "spoken-night",
    "metadata": {},
    "source": [
     "## Remember to copy and paste your output on Slack or Zoom\n",
@@ -505,7 +524,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "german-aberdeen",
+   "id": "interpreted-escape",
    "metadata": {},
    "source": [
     "-----\n",

+ 2 - 2
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_SVGPT_BIG.sh

@@ -11,7 +11,7 @@ VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
 MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
 PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path
 
-#### --------------- params in the following block are allowed to change -----------#### 
+#### [TODO]--------------- params in the following block are allowed to change -----------#### 
 WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
 GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
 
@@ -45,7 +45,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --train-samples 100 \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
+       --data-path 1. $DATA_PATH \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
        --data-impl mmap \
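
The --data-path edit in both files switches to Megatron-LM's weighted form of the argument, in which each dataset path may be preceded by a sampling weight. Even with a single corpus, giving an explicit weight of "1." routes data loading through the blendable-dataset code path, which matches the new "building indices for blendable datasets ... sample ratios: dataset 0, input: 1, achieved: 1" lines in the log above. A sketch of the multi-corpus form, with hypothetical paths that are not part of this repository:

       --data-path 0.7 ../dataset/EN/corpusA_text_document \
                   0.3 ../dataset/EN/corpusB_text_document \

Here Megatron-LM would sample from the two corpora in roughly a 70/30 ratio; with one path and weight 1. the behaviour is equivalent to the unweighted form.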