@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "boxed-privilege",
+   "id": "charged-allen",
    "metadata": {},
    "source": [
     "# \n",
@@ -50,7 +50,31 @@
   },
   {
    "cell_type": "markdown",
-   "id": "royal-holiday",
+   "id": "continuing-passport",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Hint:\n",
+    "### open a terminal and type **nvidia-smi** to monitor the GPUs' utilization and power consumption \n",
+    "### remember to fill up the GPU memory\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "corrected-bacteria",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## modify and rerun the cell below to get an even bigger GPT model \n",
+    "<a id=\"MODIFY_CELL\"></a>\n",
+    "\n",
+    "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#Rerun_Cell\">Jump to ReRun Cell</a> "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dramatic-opinion",
    "metadata": {},
    "source": [
     "<a id=\"Rerun_Cell\"></a>"
@@ -58,18 +82,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "id": "opening-description",
+   "execution_count": 1,
+   "id": "massive-industry",
    "metadata": {},
    "outputs": [],
    "source": [
-    "!rm -fr ./Megatron-LM/sv_ckpt/* "
+    "!rm -fr ../sv_ckpt/* "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "future-explorer",
+   "execution_count": 2,
+   "id": "understood-swimming",
    "metadata": {},
    "outputs": [
     {
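A note on the retargeted cleanup path: wiping `../sv_ckpt/` before rerunning matters because a model resized via LAYERS/HIDDEN_SZ cannot load checkpoints saved with the old tensor shapes, so with the directory empty Megatron falls back to random initialization (see the "could not find the metadata file" warning later in this log). A hedged equivalent outside the notebook:

```bash
# Clear stale checkpoints so the resized model starts from random init;
# -f keeps rm quiet when the directory is already empty.
rm -rf ../sv_ckpt/*
ls ../sv_ckpt/   # should print nothing before a fresh run
```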
@@ -89,22 +113,22 @@
     "NODE_RANK=0\n",
     "\n",
     "### modify this section to point the file to its own path \n",
-    "CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'\n",
-    "DATA_PATH='../dataset/SV/webnyheter2013_text_document'\n",
-    "VOCAB_FILE='../dataset/SV/32k/vocab.json'\n",
-    "MERGE_FILE='../dataset/SV/32k/merges.txt'\n",
-    "PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path\n",
+    "CHECKPOINT_PATH='../sv_ckpt/'\n",
+    "DATA_PATH='../dataset/SV/webnyheter2013_56kvocab_text_document'\n",
+    "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
+    "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
+    "PROFILE_OUTPUT_PATH='../profiles/SV/nsys_sv_' # modify this to your own profile path\n",
     "\n",
     "#### [TODO]--------------- params in the following block are allowed to change -----------#### \n",
-    "WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
-    "GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
+    "WORLD_SIZE=2 # <--- remember to change the number of GPUs you actually have in your system\n",
+    "GPUS_PER_NODE=2 # <--- remember to change the number of GPUs you actually have in your system\n",
     "\n",
-    "TENSOR_MP_SIZE=8\n",
+    "TENSOR_MP_SIZE=2\n",
     "PIPELINE_MP_SIZE=1\n",
-    "LAYERS=64\n",
-    "HIDDEN_SZ=2048\n",
+    "LAYERS=32\n",
+    "HIDDEN_SZ=4096\n",
     "NUM_ATTN_HEADS=32\n",
-    "MICRO_BZ=64\n",
+    "MICRO_BZ=8\n",
     "GLOBAL_BZ=512\n",
     "SEQ_LEN=512\n",
     "MAX_POS_EM=512\n",
@@ -151,26 +175,27 @@
   },
   {
    "cell_type": "markdown",
-   "id": "confident-prerequisite",
+   "id": "monetary-trial",
    "metadata": {},
    "source": [
     "---\n",
     "## check how big is your model - \n",
-    "I got 1 Billion :)  what about you ?"
+    "modify the parameters in [params_cnt.sh](./params_cnt.sh)\n",
+    "I got 6 Billion :)  what about you ?"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "id": "affecting-function",
+   "execution_count": 3,
+   "id": "afraid-promise",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3\n",
-      "3289513984\n"
+      "6\n",
+      "6675628032\n"
      ]
     }
    ],
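The `6675628032` printed above can be reproduced by hand. params_cnt.sh itself is not part of this diff, so the formula below is an assumption (the usual GPT-style count: vocab and position embeddings plus 12*h^2 + 13*h parameters per transformer layer), but it lands exactly on the cell output for the 56k vocabulary:

```bash
VOCAB=56000; SEQ=512; LAYERS=32; HIDDEN_SZ=4096     # values from the config cell above

EMBED=$(( (VOCAB + SEQ) * HIDDEN_SZ ))                        # token + position embeddings
PER_LAYER=$(( 12 * HIDDEN_SZ * HIDDEN_SZ + 13 * HIDDEN_SZ ))  # attention, MLP, layernorms, biases
TOTAL=$(( EMBED + LAYERS * PER_LAYER ))

echo $(( TOTAL / 10**9 ))   # -> 6 (billions)
echo ${TOTAL}               # -> 6675628032
```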
@@ -180,7 +205,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "hairy-dominican",
+   "id": "portuguese-freedom",
    "metadata": {},
    "source": [
     "---\n",
@@ -203,48 +228,37 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "competent-romania",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Re-run the cell below to get an even bigger GPT model\n",
+    "## remember to modify the [params count](./params_cnt.sh) to check how big your model is\n",
+    "## click the link below to go back and modify profile_SVGPT_BIG.sh \n",
+    "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#MODIFY_CELL\">Jump back to modify and overwrite profile_SVGPT_BIG.sh </a> \n",
+    "<a id=\"Rerun_Cell\"></a>"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 31,
-   "id": "acknowledged-brake",
+   "execution_count": 4,
+   "id": "injured-pasta",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "Initializing NVTX monkey patches\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
-      "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
+      "Initializing NVTX monkey patchesInitializing NVTX monkey patches\n",
+      "\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "Done with NVTX monkey patching\n",
       "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
       "  warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
-      "Done with NVTX monkey patching\n",
-      "using world size: 8, data-parallel-size: 1, tensor-model-parallel size: 8, pipeline-model-parallel size: 1 \n",
+      "Done with NVTX monkey patchingDone with NVTX monkey patching\n",
+      "\n",
+      "using world size: 2, data-parallel-size: 1, tensor-model-parallel size: 2, pipeline-model-parallel size: 1 \n",
       "using torch.float16 for parameters ...\n",
       "------------------------ arguments ------------------------\n",
       "  accumulate_allreduce_grads_in_fp32 .............. False\n",
@@ -272,7 +286,7 @@
      "  consumed_valid_samples .......................... 0\n",
      "  data_impl ....................................... mmap\n",
      "  data_parallel_size .............................. 1\n",
-      "  data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_text_document']\n",
+      "  data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_56kvocab_text_document']\n",
      "  dataloader_type ................................. single\n",
      "  DDP_impl ........................................ local\n",
      "  decoder_seq_length .............................. None\n",
@@ -286,14 +300,14 @@
      "  evidence_data_path .............................. None\n",
      "  exit_duration_in_mins ........................... None\n",
      "  exit_interval ................................... None\n",
-      "  ffn_hidden_size ................................. 8192\n",
+      "  ffn_hidden_size ................................. 16384\n",
      "  finetune ........................................ False\n",
      "  fp16 ............................................ True\n",
      "  fp16_lm_cross_entropy ........................... False\n",
      "  fp32_residual_connection ........................ False\n",
      "  global_batch_size ............................... 512\n",
      "  hidden_dropout .................................. 0.1\n",
-      "  hidden_size ..................................... 2048\n",
+      "  hidden_size ..................................... 4096\n",
      "  hysteresis ...................................... 2\n",
      "  ict_head_size ................................... None\n",
      "  ict_load ........................................ None\n",
@@ -303,10 +317,10 @@
      "  init_method_std ................................. 0.02\n",
      "  init_method_xavier_uniform ...................... False\n",
      "  initial_loss_scale .............................. 4294967296\n",
-      "  kv_channels ..................................... 64\n",
+      "  kv_channels ..................................... 128\n",
      "  layernorm_epsilon ............................... 1e-05\n",
      "  lazy_mpu_init ................................... None\n",
-      "  load ............................................ ./Megatron-LM/sv_ckpt/\n",
+      "  load ............................................ ../sv_ckpt/\n",
      "  local_rank ...................................... 0\n",
      "  log_batch_size_to_tensorboard ................... False\n",
      "  log_interval .................................... 10\n",
@@ -329,8 +343,8 @@
      "  mask_prob ....................................... 0.15\n",
      "  masked_softmax_fusion ........................... True\n",
      "  max_position_embeddings ......................... 512\n",
-      "  merge_file ...................................... ../dataset/SV/32k/merges.txt\n",
-      "  micro_batch_size ................................ 64\n",
+      "  merge_file ...................................... ../dataset/SV/56k/merges.txt\n",
+      "  micro_batch_size ................................ 8\n",
      "  min_loss_scale .................................. 1.0\n",
      "  min_lr .......................................... 1e-05\n",
      "  mmap_warmup ..................................... False\n",
@@ -341,7 +355,7 @@
      "  num_attention_heads ............................. 32\n",
      "  num_channels .................................... 3\n",
      "  num_classes ..................................... 1000\n",
-      "  num_layers ...................................... 64\n",
+      "  num_layers ...................................... 32\n",
      "  num_layers_per_virtual_pipeline_stage ........... None\n",
      "  num_workers ..................................... 2\n",
      "  onnx_safe ....................................... None\n",
@@ -360,7 +374,7 @@
      "  retriever_score_scaling ......................... False\n",
      "  retriever_seq_length ............................ 256\n",
      "  sample_rate ..................................... 1.0\n",
-      "  save ............................................ ./Megatron-LM/sv_ckpt/\n",
+      "  save ............................................ ../sv_ckpt/\n",
      "  save_interval ................................... 100\n",
      "  scatter_gather_tensors_in_pipeline .............. True\n",
      "  seed ............................................ 1234\n",
@@ -368,7 +382,7 @@
      "  sgd_momentum .................................... 0.9\n",
      "  short_seq_prob .................................. 0.1\n",
      "  split ........................................... 949,50,1\n",
-      "  tensor_model_parallel_size ...................... 8\n",
+      "  tensor_model_parallel_size ...................... 2\n",
      "  tensorboard_dir ................................. None\n",
      "  tensorboard_log_interval ........................ 1\n",
      "  tensorboard_queue_size .......................... 1000\n",
@@ -382,60 +396,54 @@
      "  use_one_sent_docs ............................... False\n",
      "  virtual_pipeline_model_parallel_size ............ None\n",
      "  vocab_extra_ids ................................. 0\n",
-      "  vocab_file ...................................... ../dataset/SV/32k/vocab.json\n",
+      "  vocab_file ...................................... ../dataset/SV/56k/vocab.json\n",
      "  weight_decay .................................... 0.01\n",
-      "  world_size ...................................... 8\n",
+      "  world_size ...................................... 2\n",
      "-------------------- end of arguments ---------------------\n",
-      "setting number of micro-batches to constant 8\n",
+      "setting number of micro-batches to constant 64\n",
      "> building GPT2BPETokenizer tokenizer ...\n",
-      " > padded vocab (size: 32000) with 768 dummy tokens (new size: 32768)\n",
+      " > padded vocab (size: 56000) with 64 dummy tokens (new size: 56064)\n",
      "> initializing torch distributed ...\n",
-      "> initializing tensor model parallel with size 8\n",
+      "> initializing tensor model parallel with size 2\n",
      "> initializing pipeline model parallel with size 1\n",
      "> setting random seeds to 1234 ...\n",
      "> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234\n",
      "> compiling dataset index builder ...\n",
-      "make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
+      "make: Entering directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
      "make: Nothing to be done for 'default'.\n",
-      "make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
-      ">>> done with dataset index builder. Compilation time: 0.167 seconds\n",
+      "make: Leaving directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
+      ">>> done with dataset index builder. Compilation time: 0.145 seconds\n",
      "> compiling and loading fused kernels ...\n",
      "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
      "Building extension module scaled_upper_triang_masked_softmax_cuda...\n",
      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
      "ninja: no work to do.\n",
      "Loading extension module scaled_upper_triang_masked_softmax_cuda...\n",
      "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
      "Building extension module scaled_masked_softmax_cuda...\n",
      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
      "ninja: no work to do.\n",
      "Loading extension module scaled_masked_softmax_cuda...\n",
      "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+      "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
      "Building extension module fused_mix_prec_layer_norm_cuda...\n",
      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
      "ninja: no work to do.\n",
      "Loading extension module fused_mix_prec_layer_norm_cuda...\n",
-      ">>> done with compiling and loading fused kernels. Compilation time: 18.065 seconds\n",
-      "time to initialize megatron (seconds): 90.261\n",
-      "[after megatron is initialized] datetime: 2021-08-30 08:59:22 \n",
+      ">>> done with compiling and loading fused kernels. Compilation time: 2.868 seconds\n",
+      "time to initialize megatron (seconds): 43.936\n",
+      "[after megatron is initialized] datetime: 2021-09-15 11:55:55 \n",
      "building GPT model ...\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 412995584\n",
-      " > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 412995584\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 3339395072\n",
+      " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 3339395072\n",
      "setting training iterations to 0\n",
      "> learning rate decay style: cosine\n",
-      "WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
+      "WARNING: could not find the metadata file ../sv_ckpt/latest_checkpointed_iteration.txt \n",
      "    will not load any checkpoints and will start from random\n",
-      "time (ms) | load-checkpoint: 25.10\n",
-      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 08:59:28 \n",
+      "time (ms) | load-checkpoint: 2.66\n",
+      "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-09-15 11:55:56 \n",
      "> building train, validation, and test datasets ...\n",
      " > datasets target sizes (minimum size):\n",
      "    train:      100\n",
@@ -448,7 +456,7 @@
      "    reading document index...\n",
      "    creating numpy buffer of mmap...\n",
      "    creating memory view of numpy buffer...\n",
-      " > finished creating indexed dataset in 0.004143 seconds\n",
+      " > finished creating indexed dataset in 0.004941 seconds\n",
      "    number of documents: 1249010\n",
      " > dataset split:\n",
      "    train:\n",
@@ -457,24 +465,57 @@
      "     document indices in [1185311, 1247761) total of 62450 documents\n",
      "    test:\n",
      "     document indices in [1247761, 1249010) total of 1249 documents\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > only one epoch required, setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.066494\n",
+      "    using:\n",
+      "     number of documents:       1185311\n",
+      "     number of epochs:          1\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   51303\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.008808\n",
+      " > building shuffle index with split [0, 51303) and [51303, 51303) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.002738\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
      "    loaded indexed file in 0.005 seconds\n",
-      "    total number of samples: 53948\n",
+      "    total number of samples: 51304\n",
      "    total number of epochs: 1\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
-      "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5695\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > last epoch number of samples (2438) is larger than 80% of number of samples per epoch (2708), setting separate_last_epoch to False\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.005265\n",
+      "    using:\n",
+      "     number of documents:       62450\n",
+      "     number of epochs:          2\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   5416\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.001357\n",
+      " > building shuffle index with split [0, 5416) and [5416, 5416) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.002597\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+      "    loaded indexed file in 0.002 seconds\n",
+      "    total number of samples: 5417\n",
      "    total number of epochs: 2\n",
-      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
-      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
-      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
-      "    loaded indexed file in 0.003 seconds\n",
-      "    total number of samples: 5192\n",
-      "    total number of epochs: 91\n",
+      " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+      " > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (54), setting separate_last_epoch to True\n",
+      " > elasped time to build and save doc-idx mapping (seconds): 0.004714\n",
+      "    using:\n",
+      "     number of documents:       1249\n",
+      "     number of epochs:          96\n",
+      "     sequence length:           512\n",
+      "     total number of samples:   5188\n",
+      " > elasped time to build and save sample-idx mapping (seconds): 0.001624\n",
+      " > building shuffle index with split [0, 5134) and [5134, 5188) ...\n",
+      " > elasped time to build and save shuffle-idx mapping (seconds): 0.001298\n",
+      " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+      " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+      " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+      "    loaded indexed file in 0.002 seconds\n",
+      "    total number of samples: 5189\n",
+      "    total number of epochs: 96\n",
      "> building indices for blendable datasets ...\n",
      " > sample ratios:\n",
      "   dataset 0, input: 1, achieved: 1\n",
@@ -488,17 +529,17 @@
      "   dataset 0, input: 1, achieved: 1\n",
      "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
      "> finished creating GPT datasets ...\n",
-      "[after dataloaders are built] datetime: 2021-08-30 08:59:32 \n",
+      "[after dataloaders are built] datetime: 2021-09-15 11:55:58 \n",
      "done with setup ...\n",
      "training ...\n",
-      "time (ms) | model-and-optimizer-setup: 6065.80 | train/valid/test-data-iterators-setup: 2661.91\n",
-      "[after training is done] datetime: 2021-08-30 08:59:32 \n",
+      "time (ms) | model-and-optimizer-setup: 929.42 | train/valid/test-data-iterators-setup: 1004.53\n",
+      "[after training is done] datetime: 2021-09-15 11:55:58 \n",
      "------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for val data | lm loss value: 1.081321E+01 | lm loss PPL: 4.967259E+04 | \n",
+      " validation loss at the end of training for val data | lm loss value: 1.171452E+01 | lm loss PPL: 1.223352E+05 | \n",
      "------------------------------------------------------------------------------------------------------------------\n",
      "Evaluating iter 10/10\n",
      "-------------------------------------------------------------------------------------------------------------------\n",
-      " validation loss at the end of training for test data | lm loss value: 1.081394E+01 | lm loss PPL: 4.970880E+04 | \n",
+      " validation loss at the end of training for test data | lm loss value: 1.171400E+01 | lm loss PPL: 1.222719E+05 | \n",
      "-------------------------------------------------------------------------------------------------------------------\n"
      ]
     }
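The PPL columns in the updated output are just the exponential of the reported LM loss, which gives a quick way to sanity-check such lines; for the validation value above:

```bash
# exp(11.71452) ~ 1.2233e+05, matching "lm loss PPL: 1.223352E+05" in the log
awk 'BEGIN { printf "%.6e\n", exp(11.71452) }'
```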
@@ -509,7 +550,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "determined-right",
+   "id": "entertaining-transparency",
    "metadata": {},
    "source": [
     "## Remember to copy and paste your output on Slack or Zoom\n",
@@ -518,7 +559,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "searching-worthy",
+   "id": "hidden-minister",
    "metadata": {},
    "source": [
     "-----\n",