
Adapted the profiling run to 2 A100 GPUs

zenodia · 2 years ago
commit d78548f255

+ 145 - 162
ai/Megatron/English/Python/jupyter_notebook/Day2-5_Observe_GPT_runs_vs_performance.ipynb
(File diff suppressed because it is too large)


+ 8 - 8
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/profile_2nd_run.sh

@@ -1,12 +1,12 @@
 # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
-GPUS_PER_NODE=8 # <--- remember to change the number of GPUs you actually have in your system
+GPUS_PER_NODE=2 # <--- remember to change the number of GPUs you actually have in your system
 # Change for multinode config
 MASTER_ADDR=localhost
 MASTER_PORT=6000
 NNODES=1 #<-- currently we are using 1 node multigpus
 NODE_RANK=0
-WORLD_SIZE=8 # <--- remember to change the number of GPUs you actually have in your system
-TENSOR_MP_SIZE=8
+WORLD_SIZE=2 # <--- remember to change the number of GPUs you actually have in your system
+TENSOR_MP_SIZE=2
 PIPELINE_MP_SIZE=1
 ### modify this section to point the file to its own path 
 CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it 
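
The parallelism values in this hunk must satisfy the standard Megatron-LM divisibility rule: WORLD_SIZE has to be divisible by TENSOR_MP_SIZE × PIPELINE_MP_SIZE, and the quotient is the data-parallel size. A minimal sanity-check sketch (variable names copied from the hunk above; the rule itself comes from Megatron-LM and is not shown in this diff):

    GPUS_PER_NODE=2; NNODES=1
    WORLD_SIZE=$((GPUS_PER_NODE * NNODES))    # 2, matching the manual edit above
    TENSOR_MP_SIZE=2; PIPELINE_MP_SIZE=1
    # data-parallel size = world size / (tensor parallel * pipeline parallel)
    DP_SIZE=$((WORLD_SIZE / (TENSOR_MP_SIZE * PIPELINE_MP_SIZE)))
    echo "data-parallel size = $DP_SIZE"      # 1: both GPUs shard a single model replica

With both A100s devoted to tensor parallelism there is exactly one data-parallel replica, which matters for the batch-size arithmetic in the next hunk.
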
@@ -25,12 +25,12 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --tensor-model-parallel-size $TENSOR_MP_SIZE \
        --pipeline-model-parallel-size $PIPELINE_MP_SIZE \
        --num-layers 32 \
-       --hidden-size 1024 \
+       --hidden-size 2048 \
        --num-attention-heads 32 \
-       --micro-batch-size 64 \
-       --global-batch-size 512 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
+       --micro-batch-size 16 \
+       --global-batch-size 128 \
+       --seq-length 1024 \
+       --max-position-embeddings 1024 \
        --train-samples 100 \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
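
The batch-size changes are self-consistent with the single data-parallel replica derived above: Megatron-LM fills the global batch through gradient accumulation, following global-batch = micro-batch × data-parallel size × accumulation steps. A quick check with the new values (DP_SIZE=1 carried over from the earlier sketch):

    MICRO_BATCH=16; GLOBAL_BATCH=128; DP_SIZE=1
    # gradient-accumulation steps needed per optimizer step with the 2-GPU settings
    ACCUM_STEPS=$((GLOBAL_BATCH / (MICRO_BATCH * DP_SIZE)))
    echo "gradient-accumulation steps = $ACCUM_STEPS"   # 8

Note also that --seq-length and --max-position-embeddings move together (the position-embedding table must cover the sequence length), and that doubling --hidden-size from 1024 to 2048 roughly quadruples the per-layer parameter count, which plausibly motivated cutting the micro-batch from 64 to 16.
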