@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
- "id": "boxed-privilege",
+ "id": "charged-allen",
"metadata": {},
"source": [
"# \n",
@@ -50,7 +50,31 @@
},
{
"cell_type": "markdown",
- "id": "royal-holiday",
+ "id": "continuing-passport",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Hint:\n",
+ "### Open a terminal and run **nvidia-smi** to monitor the GPUs' utilization and power consumption\n",
+ "### Remember to fill up the GPU memory\n",
+ "![call out a terminal ](./Megatron-LM/pics/Alt_callout2terminals.JPG)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "corrected-bacteria",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## Modify and rerun the cell below to get an even bigger GPT model \n",
+ "<a id=\"MODIFY_CELL\"></a>\n",
+ "\n",
+ "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#Rerun_Cell\">Jump to ReRun Cell</a> "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dramatic-opinion",
"metadata": {},
"source": [
"<a id=\"Rerun_Cell\"></a>"
@@ -58,18 +82,18 @@
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "opening-description",
+ "execution_count": 1,
+ "id": "massive-industry",
"metadata": {},
"outputs": [],
"source": [
- "!rm -fr ./Megatron-LM/sv_ckpt/* "
+ "!rm -fr ../sv_ckpt/* "
]
},
{
"cell_type": "code",
- "execution_count": 29,
- "id": "future-explorer",
+ "execution_count": 2,
+ "id": "understood-swimming",
"metadata": {},
"outputs": [
{
@@ -89,22 +113,22 @@
"NODE_RANK=0\n",
"\n",
"### modify this section to point the file to its own path \n",
- "CHECKPOINT_PATH='./Megatron-LM/sv_ckpt/'\n",
- "DATA_PATH='../dataset/SV/webnyheter2013_text_document'\n",
- "VOCAB_FILE='../dataset/SV/32k/vocab.json'\n",
- "MERGE_FILE='../dataset/SV/32k/merges.txt'\n",
- "PROFILE_OUTPUT_PATH='/home/zcharpy/profiles/DLprof/2ndrun/nsys_improved' # modify this to your own profile path\n",
+ "CHECKPOINT_PATH='../sv_ckpt/'\n",
+ "DATA_PATH='../dataset/SV/webnyheter2013_56kvocab_text_document'\n",
+ "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
+ "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
+ "PROFILE_OUTPUT_PATH='../profiles/SV/nsys_sv_' # modify this to your own profile path\n",
"\n",
"#### [TODO]--------------- params in the following block are allowed to change -----------#### \n",
- "WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
- "GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system\n",
+ "WORLD_SIZE=2 # <--- remember to change this to the number of GPUs you actually have in your system\n",
+ "GPUS_PER_NODE=2 # <--- remember to change this to the number of GPUs you actually have in your system\n",
"\n",
- "TENSOR_MP_SIZE=8\n",
+ "TENSOR_MP_SIZE=2\n",
"PIPELINE_MP_SIZE=1\n",
- "LAYERS=64\n",
- "HIDDEN_SZ=2048\n",
+ "LAYERS=32\n",
+ "HIDDEN_SZ=4096\n",
"NUM_ATTN_HEADS=32\n",
- "MICRO_BZ=64\n",
+ "MICRO_BZ=8\n",
"GLOBAL_BZ=512\n",
"SEQ_LEN=512\n",
"MAX_POS_EM=512\n",
@@ -151,26 +175,27 @@
},
{
"cell_type": "markdown",
- "id": "confident-prerequisite",
+ "id": "monetary-trial",
"metadata": {},
"source": [
"---\n",
"## check how big is your model - \n",
- "I got 1 Billion :) what about you ?"
+ "modify the parameters in [params_cnt.sh](./params_cnt.sh)\n",
+ "I got 6 billion :) what about you?"
]
},
{
"cell_type": "code",
- "execution_count": 26,
- "id": "affecting-function",
+ "execution_count": 3,
+ "id": "afraid-promise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "3\n",
- "3289513984\n"
+ "6\n",
+ "6675628032\n"
]
}
],
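Editor's note: the arithmetic below reproduces the 6675628032 printed in that cell output. It is a sketch, not the contents of params_cnt.sh itself; it assumes the usual GPT breakdown of vocab plus position embeddings and 12*h^2 + 13*h weights per transformer layer, with the unpadded 56k Swedish vocabulary and the sizes set in the configuration cell above.

```bash
VOCAB_SIZE=56000   # assumption: the 56k tokenizer vocabulary before padding
SEQ_LEN=512
LAYERS=32
HIDDEN_SZ=4096

PARAMS=$(( VOCAB_SIZE*HIDDEN_SZ + SEQ_LEN*HIDDEN_SZ + LAYERS*(12*HIDDEN_SZ*HIDDEN_SZ + 13*HIDDEN_SZ) ))
echo $(( PARAMS / 1000000000 ))   # -> 6  (billions, as printed first)
echo "$PARAMS"                    # -> 6675628032, matching the cell output
```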
@@ -180,7 +205,7 @@
},
{
"cell_type": "markdown",
- "id": "hairy-dominican",
+ "id": "portuguese-freedom",
"metadata": {},
"source": [
"---\n",
@@ -203,48 +228,37 @@
]
},
{
+ "cell_type": "markdown",
+ "id": "competent-romania",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "# Re-run the cell below to get an even bigger GPT model\n",
+ "## remember to modify the [params count](./params_cnt.sh) to check how big your model is\n",
+ "## click the link below to go back and modify profile_SVGPT_BIG.sh \n",
+ "<a href=\"./Day3-5_run_Megatron_with_varying_config.ipynb#MODIFY_CELL\">Jump back to modify and overwrite profile_SVGPT_BIG.sh </a> \n",
+ "<a id=\"Rerun_Cell\"></a>"
+ ]
+ },
+ {
"cell_type": "code",
- "execution_count": 31,
- "id": "acknowledged-brake",
+ "execution_count": 4,
+ "id": "injured-pasta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "Initializing NVTX monkey patches\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
- " warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
+ "Initializing NVTX monkey patchesInitializing NVTX monkey patches\n",
+ "\n",
"/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
" warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "Done with NVTX monkey patching\n",
"/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:144: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead\n",
" warnings.warn(\"torch.distributed.reduce_op is deprecated, please use \"\n",
- "Done with NVTX monkey patching\n",
- "using world size: 8, data-parallel-size: 1, tensor-model-parallel size: 8, pipeline-model-parallel size: 1 \n",
+ "Done with NVTX monkey patchingDone with NVTX monkey patching\n",
+ "\n",
+ "using world size: 2, data-parallel-size: 1, tensor-model-parallel size: 2, pipeline-model-parallel size: 1 \n",
"using torch.float16 for parameters ...\n",
"------------------------ arguments ------------------------\n",
" accumulate_allreduce_grads_in_fp32 .............. False\n",
@@ -272,7 +286,7 @@
" consumed_valid_samples .......................... 0\n",
" data_impl ....................................... mmap\n",
" data_parallel_size .............................. 1\n",
- " data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_text_document']\n",
+ " data_path ....................................... ['1.', '../dataset/SV/webnyheter2013_56kvocab_text_document']\n",
" dataloader_type ................................. single\n",
" DDP_impl ........................................ local\n",
" decoder_seq_length .............................. None\n",
@@ -286,14 +300,14 @@
" evidence_data_path .............................. None\n",
" exit_duration_in_mins ........................... None\n",
" exit_interval ................................... None\n",
- " ffn_hidden_size ................................. 8192\n",
+ " ffn_hidden_size ................................. 16384\n",
" finetune ........................................ False\n",
" fp16 ............................................ True\n",
" fp16_lm_cross_entropy ........................... False\n",
" fp32_residual_connection ........................ False\n",
" global_batch_size ............................... 512\n",
" hidden_dropout .................................. 0.1\n",
- " hidden_size ..................................... 2048\n",
+ " hidden_size ..................................... 4096\n",
" hysteresis ...................................... 2\n",
" ict_head_size ................................... None\n",
" ict_load ........................................ None\n",
@@ -303,10 +317,10 @@
" init_method_std ................................. 0.02\n",
" init_method_xavier_uniform ...................... False\n",
" initial_loss_scale .............................. 4294967296\n",
- " kv_channels ..................................... 64\n",
+ " kv_channels ..................................... 128\n",
" layernorm_epsilon ............................... 1e-05\n",
" lazy_mpu_init ................................... None\n",
- " load ............................................ ./Megatron-LM/sv_ckpt/\n",
+ " load ............................................ ../sv_ckpt/\n",
" local_rank ...................................... 0\n",
" log_batch_size_to_tensorboard ................... False\n",
" log_interval .................................... 10\n",
@@ -329,8 +343,8 @@
" mask_prob ....................................... 0.15\n",
" masked_softmax_fusion ........................... True\n",
" max_position_embeddings ......................... 512\n",
- " merge_file ...................................... ../dataset/SV/32k/merges.txt\n",
- " micro_batch_size ................................ 64\n",
+ " merge_file ...................................... ../dataset/SV/56k/merges.txt\n",
+ " micro_batch_size ................................ 8\n",
" min_loss_scale .................................. 1.0\n",
" min_lr .......................................... 1e-05\n",
" mmap_warmup ..................................... False\n",
@@ -341,7 +355,7 @@
" num_attention_heads ............................. 32\n",
" num_channels .................................... 3\n",
" num_classes ..................................... 1000\n",
- " num_layers ...................................... 64\n",
+ " num_layers ...................................... 32\n",
" num_layers_per_virtual_pipeline_stage ........... None\n",
" num_workers ..................................... 2\n",
" onnx_safe ....................................... None\n",
@@ -360,7 +374,7 @@
" retriever_score_scaling ......................... False\n",
" retriever_seq_length ............................ 256\n",
" sample_rate ..................................... 1.0\n",
- " save ............................................ ./Megatron-LM/sv_ckpt/\n",
+ " save ............................................ ../sv_ckpt/\n",
" save_interval ................................... 100\n",
" scatter_gather_tensors_in_pipeline .............. True\n",
" seed ............................................ 1234\n",
@@ -368,7 +382,7 @@
" sgd_momentum .................................... 0.9\n",
" short_seq_prob .................................. 0.1\n",
" split ........................................... 949,50,1\n",
- " tensor_model_parallel_size ...................... 8\n",
+ " tensor_model_parallel_size ...................... 2\n",
" tensorboard_dir ................................. None\n",
" tensorboard_log_interval ........................ 1\n",
" tensorboard_queue_size .......................... 1000\n",
@@ -382,60 +396,54 @@
" use_one_sent_docs ............................... False\n",
" virtual_pipeline_model_parallel_size ............ None\n",
" vocab_extra_ids ................................. 0\n",
- " vocab_file ...................................... ../dataset/SV/32k/vocab.json\n",
+ " vocab_file ...................................... ../dataset/SV/56k/vocab.json\n",
" weight_decay .................................... 0.01\n",
- " world_size ...................................... 8\n",
+ " world_size ...................................... 2\n",
"-------------------- end of arguments ---------------------\n",
- "setting number of micro-batches to constant 8\n",
+ "setting number of micro-batches to constant 64\n",
"> building GPT2BPETokenizer tokenizer ...\n",
- " > padded vocab (size: 32000) with 768 dummy tokens (new size: 32768)\n",
+ " > padded vocab (size: 56000) with 64 dummy tokens (new size: 56064)\n",
"> initializing torch distributed ...\n",
- "> initializing tensor model parallel with size 8\n",
+ "> initializing tensor model parallel with size 2\n",
"> initializing pipeline model parallel with size 1\n",
"> setting random seeds to 1234 ...\n",
"> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234\n",
"> compiling dataset index builder ...\n",
- "make: Entering directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
+ "make: Entering directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
"make: Nothing to be done for 'default'.\n",
- "make: Leaving directory '/home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/data'\n",
- ">>> done with dataset index builder. Compilation time: 0.167 seconds\n",
+ "make: Leaving directory '/proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/data'\n",
+ ">>> done with dataset index builder. Compilation time: 0.145 seconds\n",
"> compiling and loading fused kernels ...\n",
"Detected CUDA files, patching ldflags\n",
- "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+ "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
"Building extension module scaled_upper_triang_masked_softmax_cuda...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module scaled_upper_triang_masked_softmax_cuda...\n",
"Detected CUDA files, patching ldflags\n",
- "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+ "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
"Building extension module scaled_masked_softmax_cuda...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module scaled_masked_softmax_cuda...\n",
"Detected CUDA files, patching ldflags\n",
- "Emitting ninja build file /home/zcharpy/bootcamp/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
+ "Emitting ninja build file /proj/guest_at_nsc/users/zcharpy/gpubootcamp/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/megatron/fused_kernels/build/build.ninja...\n",
"Building extension module fused_mix_prec_layer_norm_cuda...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- ">>> done with compiling and loading fused kernels. Compilation time: 18.065 seconds\n",
- "time to initialize megatron (seconds): 90.261\n",
- "[after megatron is initialized] datetime: 2021-08-30 08:59:22 \n",
+ ">>> done with compiling and loading fused kernels. Compilation time: 2.868 seconds\n",
+ "time to initialize megatron (seconds): 43.936\n",
+ "[after megatron is initialized] datetime: 2021-09-15 11:55:55 \n",
"building GPT model ...\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (4, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (6, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (7, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (2, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (3, 0): 412995584\n",
- " > number of parameters on (tensor, pipeline) model parallel rank (5, 0): 412995584\n",
+ " > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 3339395072\n",
+ " > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 3339395072\n",
"setting training iterations to 0\n",
"> learning rate decay style: cosine\n",
- "WARNING: could not find the metadata file ./Megatron-LM/sv_ckpt/latest_checkpointed_iteration.txt \n",
+ "WARNING: could not find the metadata file ../sv_ckpt/latest_checkpointed_iteration.txt \n",
" will not load any checkpoints and will start from random\n",
- "time (ms) | load-checkpoint: 25.10\n",
- "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-08-30 08:59:28 \n",
+ "time (ms) | load-checkpoint: 2.66\n",
+ "[after model, optimizer, and learning rate scheduler are built] datetime: 2021-09-15 11:55:56 \n",
"> building train, validation, and test datasets ...\n",
" > datasets target sizes (minimum size):\n",
" train: 100\n",
@@ -448,7 +456,7 @@
" reading document index...\n",
" creating numpy buffer of mmap...\n",
" creating memory view of numpy buffer...\n",
- " > finished creating indexed dataset in 0.004143 seconds\n",
+ " > finished creating indexed dataset in 0.004941 seconds\n",
" number of documents: 1249010\n",
" > dataset split:\n",
" train:\n",
@@ -457,24 +465,57 @@
" document indices in [1185311, 1247761) total of 62450 documents\n",
" test:\n",
" document indices in [1247761, 1249010) total of 1249 documents\n",
- " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
- " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
- " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
+ " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+ " > only one epoch required, setting separate_last_epoch to False\n",
+ " > elasped time to build and save doc-idx mapping (seconds): 0.066494\n",
+ " using:\n",
+ " number of documents: 1185311\n",
+ " number of epochs: 1\n",
+ " sequence length: 512\n",
+ " total number of samples: 51303\n",
+ " > elasped time to build and save sample-idx mapping (seconds): 0.008808\n",
+ " > building shuffle index with split [0, 51303) and [51303, 51303) ...\n",
+ " > elasped time to build and save shuffle-idx mapping (seconds): 0.002738\n",
+ " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_doc_idx.npy\n",
+ " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_sample_idx.npy\n",
+ " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_train_indexmap_101ns_512sl_1234s_shuffle_idx.npy\n",
" loaded indexed file in 0.005 seconds\n",
- " total number of samples: 53948\n",
+ " total number of samples: 51304\n",
" total number of epochs: 1\n",
- " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
- " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
- " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
- " loaded indexed file in 0.003 seconds\n",
- " total number of samples: 5695\n",
+ " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+ " > last epoch number of samples (2438) is larger than 80% of number of samples per epoch (2708), setting separate_last_epoch to False\n",
+ " > elasped time to build and save doc-idx mapping (seconds): 0.005265\n",
+ " using:\n",
+ " number of documents: 62450\n",
+ " number of epochs: 2\n",
+ " sequence length: 512\n",
+ " total number of samples: 5416\n",
+ " > elasped time to build and save sample-idx mapping (seconds): 0.001357\n",
+ " > building shuffle index with split [0, 5416) and [5416, 5416) ...\n",
+ " > elasped time to build and save shuffle-idx mapping (seconds): 0.002597\n",
+ " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+ " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+ " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_valid_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+ " loaded indexed file in 0.002 seconds\n",
+ " total number of samples: 5417\n",
" total number of epochs: 2\n",
- " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
- " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
- " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
- " loaded indexed file in 0.003 seconds\n",
- " total number of samples: 5192\n",
- " total number of epochs: 91\n",
+ " > WARNING: could not find index map files, building the indices on rank 0 ...\n",
+ " > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (54), setting separate_last_epoch to True\n",
+ " > elasped time to build and save doc-idx mapping (seconds): 0.004714\n",
+ " using:\n",
+ " number of documents: 1249\n",
+ " number of epochs: 96\n",
+ " sequence length: 512\n",
+ " total number of samples: 5188\n",
+ " > elasped time to build and save sample-idx mapping (seconds): 0.001624\n",
+ " > building shuffle index with split [0, 5134) and [5134, 5188) ...\n",
+ " > elasped time to build and save shuffle-idx mapping (seconds): 0.001298\n",
+ " > loading doc-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_doc_idx.npy\n",
+ " > loading sample-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_sample_idx.npy\n",
+ " > loading shuffle-idx mapping from ../dataset/SV/webnyheter2013_56kvocab_text_document_test_indexmap_5146ns_512sl_1234s_shuffle_idx.npy\n",
+ " loaded indexed file in 0.002 seconds\n",
+ " total number of samples: 5189\n",
+ " total number of epochs: 96\n",
"> building indices for blendable datasets ...\n",
" > sample ratios:\n",
" dataset 0, input: 1, achieved: 1\n",
@@ -488,17 +529,17 @@
" dataset 0, input: 1, achieved: 1\n",
"> elapsed time for building blendable dataset indices: 0.00 (sec)\n",
"> finished creating GPT datasets ...\n",
- "[after dataloaders are built] datetime: 2021-08-30 08:59:32 \n",
+ "[after dataloaders are built] datetime: 2021-09-15 11:55:58 \n",
"done with setup ...\n",
"training ...\n",
- "time (ms) | model-and-optimizer-setup: 6065.80 | train/valid/test-data-iterators-setup: 2661.91\n",
- "[after training is done] datetime: 2021-08-30 08:59:32 \n",
+ "time (ms) | model-and-optimizer-setup: 929.42 | train/valid/test-data-iterators-setup: 1004.53\n",
+ "[after training is done] datetime: 2021-09-15 11:55:58 \n",
"------------------------------------------------------------------------------------------------------------------\n",
- " validation loss at the end of training for val data | lm loss value: 1.081321E+01 | lm loss PPL: 4.967259E+04 | \n",
+ " validation loss at the end of training for val data | lm loss value: 1.171452E+01 | lm loss PPL: 1.223352E+05 | \n",
"------------------------------------------------------------------------------------------------------------------\n",
"Evaluating iter 10/10\n",
"-------------------------------------------------------------------------------------------------------------------\n",
- " validation loss at the end of training for test data | lm loss value: 1.081394E+01 | lm loss PPL: 4.970880E+04 | \n",
+ " validation loss at the end of training for test data | lm loss value: 1.171400E+01 | lm loss PPL: 1.222719E+05 | \n",
"-------------------------------------------------------------------------------------------------------------------\n"
]
}
@@ -509,7 +550,7 @@
},
{
"cell_type": "markdown",
- "id": "determined-right",
+ "id": "entertaining-transparency",
"metadata": {},
"source": [
"## Remember to copy and paste your output on Slack or Zoom\n",
@@ -518,7 +559,7 @@
},
{
"cell_type": "markdown",
- "id": "searching-worthy",
+ "id": "hidden-minister",
"metadata": {},
"source": [
"-----\n",