
edit to train sampler instead of train iter

zenodia 3 years ago
parent
commit
f49a3d6c00
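
The commit title refers to Megatron-LM's two ways of bounding a training run: --train-iters (a fixed number of optimizer iterations) and --train-samples (a fixed number of training samples, from which the iteration count is derived). The launch blocks below keep --train-samples 100; with the new global batch size of 512, 100 samples is less than one full global batch, which is presumably why the log further down prints "setting training iterations to 0". A minimal sketch of the two alternatives (not part of this commit):

       --train-samples 100 \   # sample-based budget, as used in the notebook and profile_SVGPT_BIG.sh
    #  --train-iters 100 \     # iteration-based alternative; Megatron-LM expects exactly one of the two flags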

+ 69 - 50
ai/Megatron/English/Python/jupyter_notebook/Day3-5_run_Megatron_with_varying_config.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "external-dating",
+   "id": "postal-promotion",
    "metadata": {},
    "source": [
     "# \n",
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "signal-skiing",
+   "id": "deadly-windows",
    "metadata": {},
    "source": [
     "<a id=\"Rerun_Cell\"></a>"
@@ -59,7 +59,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "durable-closing",
+   "id": "constant-affairs",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,8 +68,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "id": "formal-steam",
+   "execution_count": 2,
+   "id": "transsexual-costume",
    "metadata": {},
    "outputs": [
     {
@@ -101,13 +101,13 @@
     "\n",
     "TENSOR_MP_SIZE=8\n",
     "PIPELINE_MP_SIZE=1\n",
-    "LAYERS=\n",
-    "HIDDEN_SZ=\n",
-    "NUM_ATTN_HEADS=\n",
-    "MICRO_BZ=\n",
-    "GLOBAL_BZ=\n",
-    "SEQ_LEN=\n",
-    "MAX_POS_EM=\n",
+    "LAYERS=32\n",
+    "HIDDEN_SZ=2048\n",
+    "NUM_ATTN_HEADS=32\n",
+    "MICRO_BZ=64\n",
+    "GLOBAL_BZ=512\n",
+    "SEQ_LEN=512\n",
+    "MAX_POS_EM=512\n",
     "#### -------------------- end of blocks ------------------------#### \n",
     "\n",
     "export OMP_NUM_THREADS=1\n",
@@ -129,7 +129,7 @@
     "       --train-samples 100 \\\n",
     "       --save $CHECKPOINT_PATH \\\n",
     "       --load $CHECKPOINT_PATH \\\n",
-    "       --data-path $DATA_PATH \\\n",
+    "       --data-path 1. $DATA_PATH \\\n",
     "       --vocab-file $VOCAB_FILE \\\n",
     "       --merge-file $MERGE_FILE \\\n",
     "       --data-impl mmap \\\n",
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "facial-teacher",
+   "id": "constant-fighter",
    "metadata": {},
    "source": [
     "---\n",
@@ -162,7 +162,7 @@
   {
    "cell_type": "code",
    "execution_count": 39,
-   "id": "eastern-prairie",
+   "id": "weekly-mechanics",
    "metadata": {},
    "outputs": [
     {
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "three-remainder",
+   "id": "concrete-finland",
    "metadata": {},
    "source": [
     "---\n",
@@ -204,8 +204,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "weekly-number",
+   "execution_count": 3,
+   "id": "experienced-standing",
    "metadata": {},
    "outputs": [
     {
@@ -221,15 +221,15 @@
       "Initializing NVTX monkey patches\n",
       "Initializing NVTX monkey patches\n",
       "Initializing NVTX monkey patches\n",
+      "Initializing NVTX monkey patches\n",
+      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Initializing NVTX monkey patchesDone with NVTX monkey patching\n",
-      "\n",
+      "Done with NVTX monkey patching\n",
+      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
       "Done with NVTX monkey patching\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
       "Done with NVTX monkey patching\n",
@@ -273,7 +273,7 @@
       "  consumed_valid_samples .......................... 0\n",
       "  data_impl ....................................... mmap\n",
       "  data_parallel_size .............................. 1\n",
-      "  data_path ....................................... ['../dataset/EN/NVblogs_text_document']\n",
+      "  data_path ....................................... ['1.', '../dataset/EN/NVblogs_text_document']\n",
       "  dataloader_type ................................. single\n",
       "  DDP_impl ........................................ local\n",
       "  decoder_seq_length .............................. None\n",
@@ -399,7 +399,7 @@
       "make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
       "make: Nothing to be done for 'default'.\n",
       "make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
-      ">>> done with dataset index builder. Compilation time: 0.644 seconds\n",
+      ">>> done with dataset index builder. Compilation time: 0.622 seconds\n",
       "> compiling and loading fused kernels ...\n",
       "Detected CUDA files, patching ldflags\n",
       "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
@@ -419,24 +419,24 @@
       "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
       "ninja: no work to do.\n",
       "Loading extension module fused_mix_prec_layer_norm_cuda...\n",
-      ">>> done with compiling and loading fused kernels. Compilation time: 29.070 seconds\n",
-      "time to initialize megatron (seconds): 21.626\n",
-      "[after megatron is initialized] datetime: 2021-08-27 06:24:37 \n",
+      ">>> done with compiling and loading fused kernels. Compilation time: 31.948 seconds\n",
+      "time to initialize megatron (seconds): 9.410\n",
+      "[after megatron is initialized] datetime: 2021-08-30 05:52:25 \n",
       "building GPT model ...\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 215937024\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024 > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+      "\n",
       " > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 215937024\n",
       "setting training iterations to 0\n",
       "> learning rate decay style: cosine\n",
       "WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
       "    will not load any checkpoints and will start from random\n",
-      "time (ms) | load-checkpoint: 28.95\n",
-      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-27 06:24:41 \n",
+      "time (ms) | load-checkpoint: 54.82\n",
+      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 05:52:29 \n",
       "> building train, validation, and test datasets ...\n",
       " > datasets target sizes (minimum size):\n",
       "    train:      100\n",
@@ -449,7 +449,7 @@
       "    reading document index...\n",
       "    creating numpy buffer of mmap...\n",
       "    creating memory view of numpy buffer...\n",
-      " > finished creating indexed dataset in 0.003079 seconds\n",
+      " > finished creating indexed dataset in 0.005710 seconds\n",
       "    number of documents: 74\n",
       " > dataset split:\n",
       "    train:\n",
@@ -458,34 +458,53 @@
       "     document indices in [70, 74) total of 4 documents\n",
       "    test:\n",
       "     document indices in [74, 74) total of 0 documents\n",
-      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_shuffle_idx.npy\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > only one epoch required, setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.002701\n",
+      "    using:\n",
+      "     number of documents:       70\n",
+      "     number of epochs:          1\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   141\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.003840\n",
+      " > building shuffle index with split [0, 141) and [141, 141) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.001237\n",
+      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
       "    loaded indexed file in 0.003 seconds\n",
       "    total number of samples: 142\n",
       "    total number of epochs: 1\n",
-      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_shuffle_idx.npy\n",
+      " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
       "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5122\n",
-      "    total number of epochs: 660\n",
+      "    total number of samples: 5153\n",
+      "    total number of epochs: 664\n",
+      "> building indices for blendable datasets ...\n",
+      " > sample ratios:\n",
+      "   dataset 0, input: 1, achieved: 1\n",
+      "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
+      "> building indices for blendable datasets ...\n",
+      " > sample ratios:\n",
+      "   dataset 0, input: 1, achieved: 1\n",
+      "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
       "> finished creating GPT datasets ...\n",
-      "[after dataloaders are built] datetime: 2021-08-27 06:24:46 \n",
+      "[after dataloaders are built] datetime: 2021-08-30 05:52:33 \n",
       "done with setup ...\n",
       "training ...\n",
-      "time (ms) | model-and-optimizer-setup: 4013.85 | train/valid/test-data-iterators-setup: 2773.74\n",
-      "[after training is done] datetime: 2021-08-27 06:24:46 \n",
+      "time (ms) | model-and-optimizer-setup: 3872.16 | train/valid/test-data-iterators-setup: 2957.85\n",
+      "[after training is done] datetime: 2021-08-30 05:52:34 \n",
       "------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for val data | lm loss value: 1.124495E+01 | lm loss PPL: 7.649290E+04 | \n",
+      " validation loss at the end of training for val data | lm loss value: 1.124461E+01 | lm loss PPL: 7.646642E+04 | \n",
       "------------------------------------------------------------------------------------------------------------------\n",
       "Processing events...\n",
       "Capturing symbol files...\n",
-      "Saving temporary \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdstrm\" file to disk...\n",
+      "Saving temporary \"/tmp/nsys-report-c19d-0647-d480-f24c.qdstrm\" file to disk...\n",
       "Creating final output files...\n",
       "\n",
       "Processing [==============================================================100%]\n",
-      "Saved report file to \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdrep\"\n",
+      "Saved report file to \"/tmp/nsys-report-c19d-0647-d480-f24c.qdrep\"\n",
       "Report file moved to \"/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved.qdrep\"\n"
      ]
     }
@@ -496,7 +515,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "parallel-language",
+   "id": "spoken-night",
    "metadata": {},
    "source": [
     "## Remember to copy and paste your output on Slack or Zoom\n",
@@ -505,7 +524,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "german-aberdeen",
+   "id": "interpreted-escape",
    "metadata": {},
    "source": [
     "-----\n",

+ 2 - 2
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_SVGPT_BIG.sh

@@ -11,7 +11,7 @@ VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json'
 MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt'
 PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path
 
-#### --------------- params in the following block are allowed to change -----------#### 
+#### [TODO]--------------- params in the following block are allowed to change -----------#### 
 WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
 GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
 
@@ -45,7 +45,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --train-samples 100 \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
+       --data-path 1. $DATA_PATH \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
        --data-impl mmap \
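
The --data-path edit in both files switches to Megatron-LM's weighted form of the argument, in which each dataset path may be preceded by a sampling weight. Even with a single corpus, giving an explicit weight of "1." routes data loading through the blendable-dataset code path, which matches the new "building indices for blendable datasets ... sample ratios: dataset 0, input: 1, achieved: 1" lines in the log above. A sketch of the multi-corpus form, with hypothetical paths that are not part of this repository:

       --data-path 0.7 ../dataset/EN/corpusA_text_document \
                   0.3 ../dataset/EN/corpusB_text_document \

Here Megatron-LM would sample from the two corpora in roughly a 70/30 ratio; with one path and weight 1. the behaviour is equivalent to the unweighted form.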