@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
- "id": "external-dating",
+ "id": "postal-promotion",
"metadata": {},
"source": [
"# \n",
@@ -50,7 +50,7 @@
},
{
"cell_type": "markdown",
- "id": "signal-skiing",
+ "id": "deadly-windows",
"metadata": {},
"source": [
"<a id=\"Rerun_Cell\"></a>"
@@ -59,7 +59,7 @@
{
"cell_type": "code",
"execution_count": 3,
- "id": "durable-closing",
+ "id": "constant-affairs",
"metadata": {},
"outputs": [],
"source": [
@@ -68,8 +68,8 @@
},
{
"cell_type": "code",
- "execution_count": 34,
- "id": "formal-steam",
+ "execution_count": 2,
+ "id": "transsexual-costume",
"metadata": {},
"outputs": [
{
@@ -101,13 +101,13 @@
"\n",
"TENSOR_MP_SIZE=8\n",
"PIPELINE_MP_SIZE=1\n",
- "LAYERS=\n",
- "HIDDEN_SZ=\n",
- "NUM_ATTN_HEADS=\n",
- "MICRO_BZ=\n",
- "GLOBAL_BZ=\n",
- "SEQ_LEN=\n",
- "MAX_POS_EM=\n",
+ "LAYERS=32\n",
+ "HIDDEN_SZ=2048\n",
+ "NUM_ATTN_HEADS=32\n",
+ "MICRO_BZ=64\n",
+ "GLOBAL_BZ=512\n",
+ "SEQ_LEN=512\n",
+ "MAX_POS_EM=512\n",
"#### -------------------- end of blocks ------------------------#### \n",
"\n",
"export OMP_NUM_THREADS=1\n",
@@ -129,7 +129,7 @@
" --train-samples 100 \\\n",
" --save $CHECKPOINT_PATH \\\n",
" --load $CHECKPOINT_PATH \\\n",
- " --data-path $DATA_PATH \\\n",
+ " --data-path 1. $DATA_PATH \\\n",
" --vocab-file $VOCAB_FILE \\\n",
" --merge-file $MERGE_FILE \\\n",
" --data-impl mmap \\\n",
@@ -151,7 +151,7 @@
},
{
"cell_type": "markdown",
- "id": "facial-teacher",
+ "id": "constant-fighter",
"metadata": {},
"source": [
"---\n",
@@ -162,7 +162,7 @@
{
"cell_type": "code",
"execution_count": 39,
- "id": "eastern-prairie",
+ "id": "weekly-mechanics",
"metadata": {},
"outputs": [
{
@@ -180,7 +180,7 @@
},
{
"cell_type": "markdown",
- "id": "three-remainder",
+ "id": "concrete-finland",
"metadata": {},
"source": [
"---\n",
@@ -204,8 +204,8 @@
},
{
"cell_type": "code",
- "execution_count": 35,
- "id": "weekly-number",
+ "execution_count": 3,
+ "id": "experienced-standing",
"metadata": {},
"outputs": [
{
@@ -221,15 +221,15 @@
"Initializing NVTX monkey patches\n",
"Initializing NVTX monkey patches\n",
"Initializing NVTX monkey patches\n",
+ "Initializing NVTX monkey patches\n",
+ "Initializing NVTX monkey patches\n",
"/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
" warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Initializing NVTX monkey patchesDone with NVTX monkey patching\n",
- "\n",
+ "Done with NVTX monkey patching\n",
+ "Initializing NVTX monkey patches\n",
"/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
" warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
"Done with NVTX monkey patching\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
"/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
" warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
"Done with NVTX monkey patching\n",
@@ -273,7 +273,7 @@
" consumed_valid_samples .......................... 0\n",
" data_impl ....................................... mmap\n",
" data_parallel_size .............................. 1\n",
- " data_path ....................................... ['../dataset/EN/NVblogs_text_document']\n",
+ " data_path ....................................... ['1.', '../dataset/EN/NVblogs_text_document']\n",
" dataloader_type ................................. single\n",
" DDP_impl ........................................ local\n",
" decoder_seq_length .............................. None\n",
@@ -399,7 +399,7 @@
"make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
"make: Nothing to be done for 'default'.\n",
"make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
- ">>> done with dataset index builder. Compilation time: 0.644 seconds\n",
+ ">>> done with dataset index builder. Compilation time: 0.622 seconds\n",
"> compiling and loading fused kernels ...\n",
"Detected CUDA files, patching ldflags\n",
"Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
@@ -419,24 +419,24 @@
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- ">>> done with compiling and loading fused kernels. Compilation time: 29.070 seconds\n",
- "time to initialize megatron (seconds): 21.626\n",
- "[after megatron is initialized] datetime: 2021-08-27 06:24:37 \n",
+ ">>> done with compiling and loading fused kernels. Compilation time: 31.948 seconds\n",
+ "time to initialize megatron (seconds): 9.410\n",
+ "[after megatron is initialized] datetime: 2021-08-30 05:52:25 \n",
"building GPT model ...\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
" > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 215937024\n",
" > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 215937024\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+ " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 215937024\n",
+ " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 215937024\n",
" > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 215937024\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024\n",
+ " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 215937024 > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 215937024\n",
+ "\n",
" > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 215937024\n",
"setting training iterations to 0\n",
"> learning rate decay style: cosine\n",
"WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
" will not load any checkpoints and will start from random\n",
- "time (ms) | load-checkpoint: 28.95\n",
- "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-27 06:24:41 \n",
+ "time (ms) | load-checkpoint: 54.82\n",
+ "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 05:52:29 \n",
"> building train, validation, and test datasets ...\n",
" > datasets target sizes (minimum size):\n",
" train: 100\n",
@@ -449,7 +449,7 @@
" reading document index...\n",
" creating numpy buffer of mmap...\n",
" creating memory view of numpy buffer...\n",
- " > finished creating indexed dataset in 0.003079 seconds\n",
+ " > finished creating indexed dataset in 0.005710 seconds\n",
" number of documents: 74\n",
" > dataset split:\n",
" train:\n",
@@ -458,34 +458,53 @@
" document indices in [70, 74) total of 4 documents\n",
" test:\n",
" document indices in [74, 74) total of 0 documents\n",
- " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_doc_idx.npy\n",
- " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_sample_idx.npy\n",
- " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_100ns_512sl_1234s_shuffle_idx.npy\n",
+ " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+ " > only one epoch required, setting separate_last_epoch to False\n",
+ " > elasped time to build and save doc-idx mapping (seconds): 0.002701\n",
+ " using:\n",
+ " number of documents: 70\n",
+ " number of epochs: 1\n",
+ " sequence length: 512\n",
+ " total number of samples: 141\n",
+ " > elasped time to build and save sample-idx mapping (seconds): 0.003840\n",
+ " > building shuffle index with split [0, 141) and [141, 141) ...\n",
+ " > elasped time to build and save shuffle-idx mapping (seconds): 0.001237\n",
+ " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+ " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+ " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
" loaded indexed file in 0.003 seconds\n",
" total number of samples: 142\n",
" total number of epochs: 1\n",
- " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_doc_idx.npy\n",
- " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_sample_idx.npy\n",
- " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5120ns_512sl_1234s_shuffle_idx.npy\n",
+ " > loading doc-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+ " > loading sample-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+ " > loading shuffle-idx mapping from ../dataset/EN/NVblogs_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
" loaded indexed file in 0.003 seconds\n",
- " total number of samples: 5122\n",
- " total number of epochs: 660\n",
+ " total number of samples: 5153\n",
+ " total number of epochs: 664\n",
+ "> building indices for blendable datasets ...\n",
+ " > sample ratios:\n",
+ " dataset 0, input: 1, achieved: 1\n",
+ "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
+ "> building indices for blendable datasets ...\n",
+ " > sample ratios:\n",
+ " dataset 0, input: 1, achieved: 1\n",
+ "> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
"> finished creating GPT datasets ...\n",
- "[after dataloaders are built] datetime: 2021-08-27 06:24:46 \n",
+ "[after dataloaders are built] datetime: 2021-08-30 05:52:33 \n",
"done with setup ...\n",
"training ...\n",
- "time (ms) | model-and-optimizer-setup: 4013.85 | train/valid/test-data-iterators-setup: 2773.74\n",
- "[after training is done] datetime: 2021-08-27 06:24:46 \n",
+ "time (ms) | model-and-optimizer-setup: 3872.16 | train/valid/test-data-iterators-setup: 2957.85\n",
+ "[after training is done] datetime: 2021-08-30 05:52:34 \n",
"------------------------------------------------------------------------------------------------------------------\n",
|
|
|
- " validation loss at the end of training for val data | lm loss value: 1.124495E+01 | lm loss PPL: 7.649290E+04 | \n",
|
|
|
+ " validation loss at the end of training for val data | lm loss value: 1.124461E+01 | lm loss PPL: 7.646642E+04 | \n",
|
|
|
"------------------------------------------------------------------------------------------------------------------\n",
|
|
|
"Processing events...\n",
|
|
|
"Capturing symbol files...\n",
|
|
|
- "Saving temporary \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdstrm\" file to disk...\n",
|
|
|
+ "Saving temporary \"/tmp/nsys-report-c19d-0647-d480-f24c.qdstrm\" file to disk...\n",
|
|
|
"Creating final output files...\n",
|
|
|
"\n",
|
|
|
"Processing [==============================================================100%]\n",
|
|
|
- "Saved report file to \"/tmp/nsys-report-96a7-0101-ea4b-0ee5.qdrep\"\n",
|
|
|
+ "Saved report file to \"/tmp/nsys-report-c19d-0647-d480-f24c.qdrep\"\n",
|
|
|
"Report file moved to \"/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved.qdrep\"\n"
|
|
|
]
|
|
|
}
|
|
@@ -496,7 +515,7 @@
},
{
"cell_type": "markdown",
- "id": "parallel-language",
+ "id": "spoken-night",
"metadata": {},
"source": [
"## Remember to copy and paste your output on Slack or Zoom\n",
@@ -505,7 +524,7 @@
},
{
"cell_type": "markdown",
- "id": "german-aberdeen",
+ "id": "interpreted-escape",
"metadata": {},
"source": [
"-----\n",