#!/bin/bash
# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
#
# SLURM batch template: multi-node Megatron-LM GPT pretraining
# (2 nodes x 8 GPUs). Replace <UserName>, <BootCamp_DIR> and every
# <FILL_IN> before submitting with sbatch.
#SBATCH -t 12:00:00
#SBATCH -A berzelius-2021-43
### Note: --gres=gpu:x should equal to ntasks-per-node
#SBATCH -N 2
#SBATCH --gres=gpu:8
### ----------------- modify <UserName> and <FILL_IN> in the section below -----------------
### NOTE(review): SLURM does not expand shell variables inside #SBATCH
### directives, so the original "$DATETIME" suffix was written literally into
### the log name; %x (job name) and %j (job id) already keep it unique.
### The stray leading "//" in the path was also dropped.
#SBATCH --output=/proj/guest_at_nsc/users/<UserName>/output/multinode_template_%x_%j.log
# Per-user working directory; dataset and checkpoint paths hang off it.
DIR='/proj/<BootCamp_DIR>/users/<UserName>/'
# Timestamp for ad-hoc labelling (cannot be used in #SBATCH lines, see above).
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
CHECKPOINT_PATH=$DIR/output/sv_gpt3_ckpt/
VOCAB_FILE=$DIR/dataset/vocab.json
MERGE_FILE=$DIR/dataset/merges.txt
DATA_PATH=$DIR/dataset/SVGPT_32k_text_document
# Model shape hyper-parameters -- every <FILL_IN> must be replaced by hand.
NHIDDEN=<FILL_IN>
NLAYERS=<FILL_IN>
NHEADS=<FILL_IN>
SEQ_LEN=<FILL_IN>
MAX_POS_EM=<FILL_IN>
### ----------------- end of section : do NOT modify anything else -----------------
  22. VOCAB_SIZE=32000
  23. MODEL_SIZE=$((($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ) / 10**9))
  24. EXACT_MODEL_SIZE=$(($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ))
  25. ### get the first node name as master address - customized for vgg slurm
  26. ### e.g. master(gnodee[2-5],gnoded1) == gnodee2
  27. echo "NODELIST="${SLURM_NODELIST}
  28. echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
  29. export MASTER_PORT=12340
  30. export WORLD_SIZE=16
  31. # The first hostname is the master address
  32. #master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
  33. #master_addr=`perl -le '$_=$ENV{"SLURM_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
  34. #master_addr=$(ip address show eth1 | grep -E '\<inet\>' | cut -d' ' -f6 | cut -c-10)
  35. master_addr=MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
  36. export MASTER_ADDR=$master_addr
  37. echo "master_addr="$master_addr
  38. echo "SLURM_NODEID="$SLURN_NODEID
  39. echo "hostname="$(hostname)
  40. echo "SLURM_PROCID="$SLURM_PROCID
  41. echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
  42. echo "SLURM_LOCALID="$SLURM_LOCALID
  43. # Execute my Singularity image binding in the current working directory
  44. #export NCCL_DEBUG=INFO
  45. export NCCL_DEBUG_SUBSYS=ALL
  46. export OMP_NUM_THREADS=1
  47. #export NCCL_IB_HCA="^mlx5_4,mlx5_5,mlx5_10,mlx5_11"
  48. export NCCL_NET=IB
  49. export NCCL_IB_HCA=${UCX_NET_DEVICES}
  50. echo $NCCL_IB_HCA
  51. export NODE_RANK=0
  52. echo "NODE_RANK="$NODE_RANK
  53. #DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 2 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
  54. DISTRIBUTED_ARGS="--nproc_per_node 8"
  55. options="--num-layers ${NLAYERS} \
  56. --hidden-size ${NHIDDEN} \
  57. --num-attention-heads ${NHEADS} \
  58. --seq-length ${SEQ_LEN} \
  59. --max-position-embeddings ${MAX_POS_EM} \
  60. --lr 0.00015 \
  61. --train-iters 100 \
  62. --min-lr 0.00001 \
  63. --lr-decay-iters 99 \
  64. --lr-warmup-fraction 0.01 \
  65. --override-lr-scheduler \
  66. --micro-batch-size 1 \
  67. --global-batch-size 16 \
  68. --vocab-file ${VOCAB_FILE} \
  69. --merge-file ${MERGE_FILE} \
  70. --split 949,50,1 \
  71. --distributed-backend nccl \
  72. --log-interval 10 \
  73. --save-interval 100 \
  74. --eval-interval 100 \
  75. --eval-iters 10 \
  76. --checkpoint-activations \
  77. --tensor-model-parallel-size 8 \
  78. --pipeline-model-parallel-size 2 \
  79. --save ${CHECKPOINT_PATH} \
  80. --load ${CHECKPOINT_PATH} \
  81. --data-path ${DATA_PATH} \
  82. --fp16 "
  83. # Execute my Singularity image binding in the current working directory
  84. cd /proj/guest_at_nsc/users/zcharpy/
  85. export SINGULARITY_BINDPATH="/proj/guest_at_nsc/users/zcharpy/"
  86. echo "SLURN_JOBID="$SLURM_JOBID
  87. export jobid=SLURM_JOBID
  88. # containing the Python script I want to execute
  89. export SINGULARITY_BINDPATH="/proj/guest_at_nsc/users/zcharpy/"
  90. # containing the Python script I want to execute
  91. singularity exec --nv pytorch_21.03.sif python -m torch.distributed.launch ${DISTRIBUTED_ARGS} \
  92. ${DIR}/Megatron-LM/pretrain_gpt.py \
  93. ${options}
  94. echo $MODEL_SIZE
  95. echo $EXACT_MODEL_SIZE
  96. echo "if you see this line, this means that you have successfully ran Megatron-LM, congratulations !"