|
@@ -2,7 +2,7 @@
|
|
"cells": [
|
|
"cells": [
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "rising-software",
|
|
|
|
|
|
+ "id": "alike-prisoner",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"## Scale up model size\n",
|
|
"## Scale up model size\n",
|
|
@@ -37,7 +37,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "historic-eating",
|
|
|
|
|
|
+ "id": "material-finland",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
@@ -47,7 +47,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "cleared-toolbox",
|
|
|
|
|
|
+ "id": "driven-drawing",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"Modify and rerun the code blocks below to obtain an even bigger GPT model. \n",
|
|
"Modify and rerun the code blocks below to obtain an even bigger GPT model. \n",
|
|
@@ -59,7 +59,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "large-buying",
|
|
|
|
|
|
+ "id": "proprietary-marketing",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"<a id=\"MODIFY_CELL\"></a>"
|
|
"<a id=\"MODIFY_CELL\"></a>"
|
|
@@ -67,7 +67,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "approved-beatles",
|
|
|
|
|
|
+ "id": "adjustable-engineer",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"Always clean the checkpoint folder to ensure training starts from scratch."
|
|
"Always clean the checkpoint folder to ensure training starts from scratch."
|
|
@@ -75,71 +75,64 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 1,
|
|
|
|
- "id": "attended-vault",
|
|
|
|
|
|
+ "execution_count": null,
|
|
|
|
+ "id": "other-parts",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
- "!rm -fr ../sv_ckpt/* "
|
|
|
|
|
|
+ "!rm -fr ../sv_ckpt/* \n",
|
|
|
|
+ "!rm -fr ../dataset/SV/*.npy"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 2,
|
|
|
|
- "id": "engaging-ocean",
|
|
|
|
|
|
+ "execution_count": null,
|
|
|
|
+ "id": "invisible-pepper",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
- "outputs": [
|
|
|
|
- {
|
|
|
|
- "name": "stdout",
|
|
|
|
- "output_type": "stream",
|
|
|
|
- "text": [
|
|
|
|
- "Overwriting ./Megatron-LM/profile_SVGPT_BIG.sh\n"
|
|
|
|
- ]
|
|
|
|
- }
|
|
|
|
- ],
|
|
|
|
|
|
+ "outputs": [],
|
|
"source": [
|
|
"source": [
|
|
"%%writefile ./Megatron-LM/SV_GPT_goingBIG.sh\n",
|
|
"%%writefile ./Megatron-LM/SV_GPT_goingBIG.sh\n",
|
|
"# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.\n",
|
|
"# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.\n",
|
|
|
|
+ "# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.\n",
|
|
|
|
+ "GPUS_PER_NODE=2 # <--- remember to change this to the number of GPUs you actually have in your system\n",
|
|
|
|
+ "# Change for multinode config\n",
|
|
"MASTER_ADDR=localhost\n",
|
|
"MASTER_ADDR=localhost\n",
|
|
"MASTER_PORT=6000\n",
|
|
"MASTER_PORT=6000\n",
|
|
"NNODES=1 #<-- currently we are using 1 node multigpus\n",
|
|
"NNODES=1 #<-- currently we are using 1 node multigpus\n",
|
|
"NODE_RANK=0\n",
|
|
"NODE_RANK=0\n",
|
|
- "WORLD_SIZE=2 \n",
|
|
|
|
- "GPUS_PER_NODE=2 \n",
|
|
|
|
- "\n",
|
|
|
|
|
|
+ "WORLD_SIZE=2 # <--- remember to change this to the number of GPUs you actually have in your system\n",
|
|
"\n",
|
|
"\n",
|
|
- "CHECKPOINT_PATH='../sv_ckpt/'\n",
|
|
|
|
- "DATA_PATH='../dataset/SV/webnyheter2013_56kvocab_text_document'\n",
|
|
|
|
- "VOCAB_FILE='../dataset/SV/56k/vocab.json'\n",
|
|
|
|
- "MERGE_FILE='../dataset/SV/56k/merges.txt'\n",
|
|
|
|
- "PROFILE_OUTPUT_PATH='../profiles/SV/nsys_sv_' # modify this to your own profile path\n",
|
|
|
|
- "\n",
|
|
|
|
- "# -------------------- ##### Begin of modifiable block ##### -------------------- \n",
|
|
|
|
|
|
+ "### modify this section to point the file to its own path \n",
|
|
|
|
+ "CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it \n",
|
|
|
|
+ "DATA_PATH='../dataset/SV/webnyheter2013_32kvocab_text_document' ## modify this path if you customize it \n",
|
|
|
|
+ "VOCAB_FILE='../dataset/SV/32k/vocab.json' ## modify this path if you customize it \n",
|
|
|
|
+ "MERGE_FILE='../dataset/SV/32k/merges.txt' ## modify this path if you customize it \n",
|
|
|
|
+ "PROFILE_OUTPUT_PATH='../profiles/SV/nsys_improved2' # modify this to your own profile path\n",
|
|
"\n",
|
|
"\n",
|
|
|
|
+ "################ Beginning of modifiable section ####################\n",
|
|
"TENSOR_MP_SIZE=<FILL_IN>\n",
|
|
"TENSOR_MP_SIZE=<FILL_IN>\n",
|
|
"PIPELINE_MP_SIZE=<FILL_IN>\n",
|
|
"PIPELINE_MP_SIZE=<FILL_IN>\n",
|
|
- "LAYERS=<FILL_IN>\n",
|
|
|
|
- "HIDDEN_SZ=<FILL_IN>\n",
|
|
|
|
|
|
+ "NUM_LYS=<FILL_IN>\n",
|
|
|
|
+ "HIDDEN_SIZE=<FILL_IN>\n",
|
|
"NUM_ATTN_HEADS=<FILL_IN>\n",
|
|
"NUM_ATTN_HEADS=<FILL_IN>\n",
|
|
- "MICRO_BZ=<FILL_IN>\n",
|
|
|
|
- "GLOBAL_BZ=<FILL_IN>\n",
|
|
|
|
"SEQ_LEN=<FILL_IN>\n",
|
|
"SEQ_LEN=<FILL_IN>\n",
|
|
"MAX_POS_EM=<FILL_IN>\n",
|
|
"MAX_POS_EM=<FILL_IN>\n",
|
|
|
|
+ "MICRO_BZ=<FILL_IN>\n",
|
|
|
|
+ "GLOBAL_BZ=<FILL_IN>\n",
|
|
"\n",
|
|
"\n",
|
|
- "# -------------------- ##### End of modifiable blocks ##### ------------------------ \n",
|
|
|
|
|
|
+ "############## end of modifiable section, do NOT modify anything below this line ####################\n",
|
|
"\n",
|
|
"\n",
|
|
- "################## DO NOT modify anything below this line ##################\n",
|
|
|
|
"export OMP_NUM_THREADS=1\n",
|
|
"export OMP_NUM_THREADS=1\n",
|
|
"DISTRIBUTED_ARGS=\"--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT\"\n",
|
|
"DISTRIBUTED_ARGS=\"--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT\"\n",
|
|
"\n",
|
|
"\n",
|
|
- "## We turn off nsys profiling decoration to avoid the small overhead\n",
|
|
|
|
|
|
+ "## for nsys run\n",
|
|
"#nsys profile --stats=false --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt,nvtx -o $PROFILE_OUTPUT_PATH \\\n",
|
|
"#nsys profile --stats=false --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt,nvtx -o $PROFILE_OUTPUT_PATH \\\n",
|
|
"python -m torch.distributed.launch $DISTRIBUTED_ARGS \\\n",
|
|
"python -m torch.distributed.launch $DISTRIBUTED_ARGS \\\n",
|
|
- " ./Megatron-LM/Dlprof_pretrain_gpt.py \\\n",
|
|
|
|
|
|
+ " ./Megatron-LM/pretrain_gpt.py \\\n",
|
|
" --tensor-model-parallel-size ${TENSOR_MP_SIZE} \\\n",
|
|
" --tensor-model-parallel-size ${TENSOR_MP_SIZE} \\\n",
|
|
" --pipeline-model-parallel-size ${PIPELINE_MP_SIZE} \\\n",
|
|
" --pipeline-model-parallel-size ${PIPELINE_MP_SIZE} \\\n",
|
|
- " --num-layers ${LAYERS} \\\n",
|
|
|
|
- " --hidden-size ${HIDDEN_SZ} \\\n",
|
|
|
|
|
|
+ " --num-layers ${NUM_LYS} \\\n",
|
|
|
|
+ " --hidden-size ${HIDDEN_SIZE} \\\n",
|
|
" --num-attention-heads ${NUM_ATTN_HEADS} \\\n",
|
|
" --num-attention-heads ${NUM_ATTN_HEADS} \\\n",
|
|
" --micro-batch-size ${MICRO_BZ} \\\n",
|
|
" --micro-batch-size ${MICRO_BZ} \\\n",
|
|
" --global-batch-size ${GLOBAL_BZ} \\\n",
|
|
" --global-batch-size ${GLOBAL_BZ} \\\n",
|
|
@@ -148,7 +141,7 @@
|
|
" --train-samples 100 \\\n",
|
|
" --train-samples 100 \\\n",
|
|
" --save ${CHECKPOINT_PATH} \\\n",
|
|
" --save ${CHECKPOINT_PATH} \\\n",
|
|
" --load ${CHECKPOINT_PATH} \\\n",
|
|
" --load ${CHECKPOINT_PATH} \\\n",
|
|
- " --data-path 1. ${DATA_PATH} \\\n",
|
|
|
|
|
|
+ " --data-path ${DATA_PATH} \\\n",
|
|
" --vocab-file ${VOCAB_FILE} \\\n",
|
|
" --vocab-file ${VOCAB_FILE} \\\n",
|
|
" --merge-file ${MERGE_FILE} \\\n",
|
|
" --merge-file ${MERGE_FILE} \\\n",
|
|
" --data-impl mmap \\\n",
|
|
" --data-impl mmap \\\n",
|
|
@@ -170,12 +163,12 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "determined-cliff",
|
|
|
|
|
|
+ "id": "formal-turner",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
- "Check how big is your model. By modify the parameters in the [params_cnt.sh](./params_cnt.sh)\n",
|
|
|
|
|
|
+ "Check how big your model is by modifying the parameters in [params_cnt.sh](./params_cnt.sh) to match the training parameters above.\n",
|
|
"\n",
|
|
"\n",
|
|
- "I got 6.6 Billion :) what about you ?\n",
|
|
|
|
|
|
+ "I got 1.6 Billion :) what about you ?\n",
|
|
"\n",
|
|
"\n",
|
|
"Modify the [params count](./params_cnt.sh) according to your training configuration.\n",
|
|
"Modify the [params count](./params_cnt.sh) according to your training configuration.\n",
|
|
"\n",
|
|
"\n",
|
|
@@ -185,7 +178,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"execution_count": null,
|
|
- "id": "green-magic",
|
|
|
|
|
|
+ "id": "welcome-donor",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -194,18 +187,18 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "awful-candle",
|
|
|
|
|
|
+ "id": "noticed-trinity",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"Below is an example of expected outputs:\n",
|
|
"Below is an example of expected outputs:\n",
|
|
" \n",
|
|
" \n",
|
|
- " 6 <-- One could get different number depend on your training config\n",
|
|
|
|
- " 6675628032 <-- One could get different number depend on your training config\n"
|
|
|
|
|
|
+ " 1 <-- One could get a different number depending on your training config\n",
|
|
|
|
+ " 1678049280 <-- One could get a different number depending on your training config\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "great-league",
|
|
|
|
|
|
+ "id": "convenient-ontario",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"Re-run this cell below to get an even bigger GPT model\n",
|
|
"Re-run this cell below to get an even bigger GPT model\n",
|
|
@@ -220,16 +213,16 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"execution_count": null,
|
|
- "id": "italian-karma",
|
|
|
|
|
|
+ "id": "representative-kentucky",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
- "!./Megatron-LM/SV_GPT_goingBIG.sh"
|
|
|
|
|
|
+ "!bash ./Megatron-LM/SV_GPT_goingBIG.sh"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "outstanding-application",
|
|
|
|
|
|
+ "id": "unnecessary-african",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"Below is an example of expected outputs:\n",
|
|
"Below is an example of expected outputs:\n",
|
|
@@ -252,7 +245,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "blessed-grammar",
|
|
|
|
|
|
+ "id": "pretty-handle",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"---\n",
|
|
"---\n",
|
|
@@ -263,7 +256,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "velvet-nylon",
|
|
|
|
|
|
+ "id": "caroline-induction",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"-----\n",
|
|
"-----\n",
|
|
@@ -272,7 +265,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "framed-blood",
|
|
|
|
|
|
+ "id": "ranking-pillow",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"-----\n",
|
|
"-----\n",
|