#!/bin/bash
# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
#
# Slurm batch script that launches Megatron-LM GPT pretraining
# (pretrain_gpt.py) through torch.distributed.launch inside a Singularity
# container.
#
#SBATCH -t 12:00:00
#SBATCH -A berzelius-2021-43
### Note: --gres=gpu:x should be equal to ntasks-per-node
#SBATCH -N 2
#SBATCH --gres=gpu:8
### ----------------- modify the values in the section below -----------------
### sbatch does not expand shell variables inside #SBATCH lines, so the log
### name relies only on the %x (job name) and %j (job id) filename patterns.
### NOTE(review): the paths below contain empty components ("//"); they look
### like placeholders that must be filled in -- confirm before submitting.
#SBATCH --output=//proj/guest_at_nsc/users//output/multinode_template_%x_%j.log
DIR='/proj//users//'
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
CHECKPOINT_PATH=$DIR/output/sv_gpt3_ckpt/
VOCAB_FILE=$DIR/dataset/vocab.json
MERGE_FILE=$DIR/dataset/merges.txt
DATA_PATH=$DIR/dataset/SVGPT_32k_text_document
NHIDDEN=
NLAYERS=
NHEADS=
SEQ_LEN=
MAX_POS_EM=
### ----------------- end of section : do NOT modify anything else -----------------

# Print an error message on stderr and abort the job.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Fail fast when a template value was left empty or is not a positive
# integer; otherwise the arithmetic below errors out and the launcher would
# be started with missing option values.
for name in NHIDDEN NLAYERS NHEADS SEQ_LEN MAX_POS_EM; do
  [[ "${!name}" =~ ^[1-9][0-9]*$ ]] \
    || die "${name} must be set to a positive integer (got '${!name}')"
done

# The start timestamp cannot be part of the #SBATCH log name, so record it
# inside the log instead.
echo "JOB_START=${DATETIME}"

VOCAB_SIZE=32000
# EXACT_MODEL_SIZE is computed from the layer count, hidden size, vocabulary
# size and sequence length; MODEL_SIZE is the same value integer-divided by
# 10^9.
EXACT_MODEL_SIZE=$(( NLAYERS * (12 * NHIDDEN ** 2 + 13 * NHIDDEN) \
  + VOCAB_SIZE * NHIDDEN + SEQ_LEN * NHIDDEN ))
MODEL_SIZE=$(( EXACT_MODEL_SIZE / 10 ** 9 ))

echo "NODELIST=${SLURM_NODELIST}"
echo "LOCAL_PROCESS_RANK=${LOCAL_PROCESS_RANK}"

export MASTER_PORT=12340
export WORLD_SIZE=16

### Get the first node name as master address,
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
# Prefer scontrol, which expands the compressed node list; fall back to the
# perl text manipulation (cut at the first ',' and '-', drop '[') when
# scontrol is not available.
if command -v scontrol >/dev/null 2>&1; then
  master_addr=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
else
  master_addr=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
fi
export MASTER_ADDR=${master_addr}

echo "master_addr=${master_addr}"
echo "SLURM_NODEID=${SLURM_NODEID}"
echo "hostname=$(hostname)"
echo "SLURM_PROCID=${SLURM_PROCID}"
echo "SLURM_LOCALID=${SLURM_LOCALID}"

# NCCL / OpenMP settings.
#export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
export OMP_NUM_THREADS=1
#export NCCL_IB_HCA="^mlx5_4,mlx5_5,mlx5_10,mlx5_11"
export NCCL_NET=IB
export NCCL_IB_HCA=${UCX_NET_DEVICES}
echo "${NCCL_IB_HCA}"

export NODE_RANK=0
echo "NODE_RANK=${NODE_RANK}"

# NOTE(review): only --nproc_per_node is passed to the launcher; the
# multi-node arguments exist only in the commented-out variant below and the
# launcher is invoked once, without srun. Confirm this is intended for a
# 2-node allocation.
#DISTRIBUTED_ARGS=(--nproc_per_node 8 --nnodes 2 --node_rank "$NODE_RANK" --master_addr "$MASTER_ADDR" --master_port "$MASTER_PORT")
DISTRIBUTED_ARGS=(--nproc_per_node 8)

# Arguments for pretrain_gpt.py, kept in an array so that every value is
# passed as exactly one word even if a path contains whitespace.
options=(
  --num-layers "${NLAYERS}"
  --hidden-size "${NHIDDEN}"
  --num-attention-heads "${NHEADS}"
  --seq-length "${SEQ_LEN}"
  --max-position-embeddings "${MAX_POS_EM}"
  --lr 0.00015
  --train-iters 100
  --min-lr 0.00001
  --lr-decay-iters 99
  --lr-warmup-fraction 0.01
  --override-lr-scheduler
  --micro-batch-size 1
  --global-batch-size 16
  --vocab-file "${VOCAB_FILE}"
  --merge-file "${MERGE_FILE}"
  --split 949,50,1
  --distributed-backend nccl
  --log-interval 10
  --save-interval 100
  --eval-interval 100
  --eval-iters 10
  --checkpoint-activations
  --tensor-model-parallel-size 8
  --pipeline-model-parallel-size 2
  --save "${CHECKPOINT_PATH}"
  --load "${CHECKPOINT_PATH}"
  --data-path "${DATA_PATH}"
  --fp16
)

# Run from the directory that holds the Singularity image and bind it into
# the container.
WORK_DIR="/proj/guest_at_nsc/users/zcharpy/"
cd "${WORK_DIR}" || die "cannot cd to ${WORK_DIR}"
export SINGULARITY_BINDPATH="${WORK_DIR}"

echo "SLURM_JOBID=${SLURM_JOBID}"
export jobid=${SLURM_JOBID}

# Execute the training script inside the Singularity image.
singularity exec --nv pytorch_21.03.sif \
  python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" \
  "${DIR}/Megatron-LM/pretrain_gpt.py" \
  "${options[@]}"
status=$?

echo "${MODEL_SIZE}"
echo "${EXACT_MODEL_SIZE}"

# Report success only when the launcher actually exited cleanly, and
# propagate its exit status to Slurm otherwise.
if (( status != 0 )); then
  printf 'ERROR: Megatron-LM run failed with exit status %d\n' "${status}" >&2
  exit "${status}"
fi
echo "if you see this line, this means that you have successfully ran Megatron-LM, congratulations !"