|
@@ -0,0 +1,107 @@
|
|
|
+#!/bin/bash
|
|
|
+# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
|
|
|
+
|
|
|
+#SBATCH -t 12:00:00
|
|
|
+#SBATCH -A berzelius-2021-43
|
|
|
+### Note: --gres=gpu:x should equal to ntasks-per-node
|
|
|
+#SBATCH -N 2
|
|
|
+#SBATCH --gres=gpu:8
|
|
|
+### ----------------- modify <UserName> and <FILL_IN> in the section below -----------------
|
|
|
+#SBATCH --output=//proj/guest_at_nsc/users/<UserName>/output/multinode_template_%x_%j_$DATETIME.log
|
|
|
+
|
|
|
+DIR='/proj/guest_at_nsc/users/<UserName>/'
|
|
|
+DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
|
|
|
+CHECKPOINT_PATH=$DIR/output/sv_gpt3_ckpt/
|
|
|
+VOCAB_FILE=$DIR/dataset/vocab.json
|
|
|
+MERGE_FILE=$DIR/dataset/merges.txt
|
|
|
+DATA_PATH=$DIR/dataset/SVGPT_32k_text_document
|
|
|
+
|
|
|
+NHIDDEN=<FILL_IN>
|
|
|
+NLAYERS=<FILL_IN>
|
|
|
+NHEADS=<FILL_IN>
|
|
|
+SEQ_LEN=<FILL_IN>
|
|
|
+MAX_POS_EM=<FILL_IN>
|
|
|
+### ----------------- end of section : do NOT modify anything else -----------------
|
|
|
+
|
|
|
+VOCAB_SIZE=32000
|
|
|
+MODEL_SIZE=$((($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ) / 10**9))
|
|
|
+EXACT_MODEL_SIZE=$(($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ))
|
|
|
+
|
|
|
+### get the first node name as master address - customized for vgg slurm
|
|
|
+### e.g. master(gnodee[2-5],gnoded1) == gnodee2
|
|
|
+echo "NODELIST="${SLURM_NODELIST}
|
|
|
+echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
|
|
|
+export MASTER_PORT=12340
|
|
|
+export WORLD_SIZE=16
|
|
|
+# The first hostname is the master address
|
|
|
+#master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
|
|
+#master_addr=`perl -le '$_=$ENV{"SLURM_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
|
|
|
+#master_addr=$(ip address show eth1 | grep -E '\<inet\>' | cut -d' ' -f6 | cut -c-10)
|
|
|
+master_addr=MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
|
|
|
+export MASTER_ADDR=$master_addr
|
|
|
+
|
|
|
+echo "master_addr="$master_addr
|
|
|
+echo "SLURM_NODEID="$SLURN_NODEID
|
|
|
+echo "hostname="$(hostname)
|
|
|
+echo "SLURM_PROCID="$SLURM_PROCID
|
|
|
+echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
|
|
|
+echo "SLURM_LOCALID="$SLURM_LOCALID
|
|
|
+
|
|
|
+# Execute my Singularity image binding in the current working directory
|
|
|
+#export NCCL_DEBUG=INFO
|
|
|
+export NCCL_DEBUG_SUBSYS=ALL
|
|
|
+export OMP_NUM_THREADS=1
|
|
|
+#export NCCL_IB_HCA="^mlx5_4,mlx5_5,mlx5_10,mlx5_11"
|
|
|
+export NCCL_NET=IB
|
|
|
+export NCCL_IB_HCA=${UCX_NET_DEVICES}
|
|
|
+echo $NCCL_IB_HCA
|
|
|
+export NODE_RANK=0
|
|
|
+echo "NODE_RANK="$NODE_RANK
|
|
|
+
|
|
|
+#DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 2 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
|
|
|
+DISTRIBUTED_ARGS="--nproc_per_node 8"
|
|
|
+
|
|
|
+options="--num-layers ${NLAYERS} \
|
|
|
+ --hidden-size ${NHIDDEN} \
|
|
|
+ --num-attention-heads ${NHEADS} \
|
|
|
+ --seq-length ${SEQ_LEN} \
|
|
|
+ --max-position-embeddings ${MAX_POS_EM} \
|
|
|
+ --lr 0.00015 \
|
|
|
+ --train-iters 100 \
|
|
|
+ --min-lr 0.00001 \
|
|
|
+ --lr-decay-iters 99 \
|
|
|
+ --lr-warmup-fraction 0.01 \
|
|
|
+ --override-lr-scheduler \
|
|
|
+ --micro-batch-size 1 \
|
|
|
+ --global-batch-size 16 \
|
|
|
+ --vocab-file ${VOCAB_FILE} \
|
|
|
+ --merge-file ${MERGE_FILE} \
|
|
|
+ --split 949,50,1 \
|
|
|
+ --distributed-backend nccl \
|
|
|
+ --log-interval 10 \
|
|
|
+ --save-interval 100 \
|
|
|
+ --eval-interval 100 \
|
|
|
+ --eval-iters 10 \
|
|
|
+ --checkpoint-activations \
|
|
|
+ --tensor-model-parallel-size 8 \
|
|
|
+ --pipeline-model-parallel-size 2 \
|
|
|
+ --save ${CHECKPOINT_PATH} \
|
|
|
+ --load ${CHECKPOINT_PATH} \
|
|
|
+ --data-path ${DATA_PATH} \
|
|
|
+ --fp16 "
|
|
|
+# Execute my Singularity image binding in the current working directory
|
|
|
+cd /proj/guest_at_nsc/users/zcharpy/
|
|
|
+export SINGULARITY_BINDPATH="/proj/guest_at_nsc/users/zcharpy/"
|
|
|
+echo "SLURN_JOBID="$SLURM_JOBID
|
|
|
+export jobid=SLURM_JOBID
|
|
|
+# containing the Python script I want to execute
|
|
|
+
|
|
|
+export SINGULARITY_BINDPATH="/proj/guest_at_nsc/users/zcharpy/"
|
|
|
+# containing the Python script I want to execute
|
|
|
+singularity exec --nv pytorch_21.03.sif python -m torch.distributed.launch ${DISTRIBUTED_ARGS} \
|
|
|
+ ${DIR}/Megatron-LM/pretrain_gpt.py \
|
|
|
+ ${options}
|
|
|
+
|
|
|
+echo $MODEL_SIZE
|
|
|
+echo $EXACT_MODEL_SIZE
|
|
|
+echo "if you see this line, this means that you have successfully ran Megatron-LM, congratulations !"
|