Browse Source

Copy the PyTorch SIF file and create the correct directory structure

zenodia 3 years ago
parent
commit
983b5cc182

+ 107 - 0
ai/Megatron/English/Python/source_code/Day1-runMegatron-LM_GPT_template.sh

@@ -0,0 +1,107 @@
+#!/bin/bash
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+
+#SBATCH -t 12:00:00
+#SBATCH -A berzelius-2021-43
+### Note: --gres=gpu:x should equal to ntasks-per-node
+#SBATCH -N 2
+#SBATCH --gres=gpu:8
+###  -----------------  modify <UserName> and <FILL_IN> in the section below -----------------
+#SBATCH --output=//proj/guest_at_nsc/users/<UserName>/output/multinode_template_%x_%j_$DATETIME.log 
+
+DIR='/proj/guest_at_nsc/users/<UserName>/'
+DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
+CHECKPOINT_PATH=$DIR/output/sv_gpt3_ckpt/
+VOCAB_FILE=$DIR/dataset/vocab.json
+MERGE_FILE=$DIR/dataset/merges.txt
+DATA_PATH=$DIR/dataset/SVGPT_32k_text_document
+
+NHIDDEN=<FILL_IN>
+NLAYERS=<FILL_IN>
+NHEADS=<FILL_IN>
+SEQ_LEN=<FILL_IN>
+MAX_POS_EM=<FILL_IN>
+### ----------------- end of section : do NOT modify anything else -----------------
+
+VOCAB_SIZE=32000
+MODEL_SIZE=$((($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ) / 10**9))
+EXACT_MODEL_SIZE=$(($NLAYERS * (12*$NHIDDEN**2 + 13*$NHIDDEN) + ($VOCAB_SIZE * $NHIDDEN) + ($SEQ_LEN * $NHIDDEN) ))
+
### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
echo "NODELIST="${SLURM_NODELIST}
echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
export MASTER_PORT=12340
export WORLD_SIZE=16
# The first hostname is the master address
#master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
#master_addr=$(perl -le '$_=$ENV{"SLURM_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
#master_addr=$(ip address show eth1 | grep -E '\<inet\>' | cut -d' ' -f6 | cut -c-10)
# BUG FIX: the original read  master_addr=MASTER_ADDR=`perl ...`  which
# prefixed the value with the literal string "MASTER_ADDR=", producing an
# unusable address like "MASTER_ADDR=gnodee2".
master_addr=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
export MASTER_ADDR=$master_addr

echo "master_addr="$master_addr
echo "SLURM_NODEID="$SLURM_NODEID   # BUG FIX: was the misspelled $SLURN_NODEID
echo "hostname="$(hostname)
echo "SLURM_PROCID="$SLURM_PROCID
echo "LOCAL_PROCESS_RANK="$LOCAL_PROCESS_RANK
echo "SLURM_LOCALID="$SLURM_LOCALID
+
### NCCL / OpenMP environment for multi-node communication.
# Uncomment for verbose NCCL logging:
#export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL   # trace every NCCL subsystem
export OMP_NUM_THREADS=1       # one OpenMP thread per launched process

# Force NCCL onto InfiniBand, reusing whatever HCAs UCX was configured with.
#export NCCL_IB_HCA="^mlx5_4,mlx5_5,mlx5_10,mlx5_11"
export NCCL_NET=IB
export NCCL_IB_HCA="${UCX_NET_DEVICES}"
echo $NCCL_IB_HCA

export NODE_RANK=0
echo "NODE_RANK="$NODE_RANK

# Full multi-node launcher arguments, kept here for reference:
#DISTRIBUTED_ARGS="--nproc_per_node 8 --nnodes 2 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
DISTRIBUTED_ARGS="--nproc_per_node 8"
+
# Megatron-LM GPT pretraining arguments, built up one option at a time so
# individual flags are easy to comment out or tweak.  The string is expanded
# unquoted at the call site, so single-space separators are equivalent to the
# original backslash-continued form.
options="--num-layers ${NLAYERS}"
options+=" --hidden-size ${NHIDDEN}"
options+=" --num-attention-heads ${NHEADS}"
options+=" --seq-length ${SEQ_LEN}"
options+=" --max-position-embeddings ${MAX_POS_EM}"
# learning-rate schedule
options+=" --lr 0.00015"
options+=" --train-iters 100"
options+=" --min-lr 0.00001"
options+=" --lr-decay-iters 99"
options+=" --lr-warmup-fraction 0.01"
options+=" --override-lr-scheduler"
# batch sizes
options+=" --micro-batch-size 1"
options+=" --global-batch-size 16"
# tokenizer / dataset
options+=" --vocab-file ${VOCAB_FILE}"
options+=" --merge-file ${MERGE_FILE}"
options+=" --split 949,50,1"
options+=" --distributed-backend nccl"
# logging and checkpoint cadence
options+=" --log-interval 10"
options+=" --save-interval 100"
options+=" --eval-interval 100"
options+=" --eval-iters 10"
options+=" --checkpoint-activations"
# model parallelism: 8-way tensor x 2-way pipeline = 16 GPUs
options+=" --tensor-model-parallel-size 8"
options+=" --pipeline-model-parallel-size 2"
options+=" --save ${CHECKPOINT_PATH}"
options+=" --load ${CHECKPOINT_PATH}"
options+=" --data-path ${DATA_PATH}"
options+=" --fp16"
# Execute my Singularity image, binding the project directory into the
# container so the Python script and data are visible inside it.
cd /proj/guest_at_nsc/users/zcharpy/
export SINGULARITY_BINDPATH="/proj/guest_at_nsc/users/zcharpy/"
echo "SLURM_JOBID="$SLURM_JOBID
# BUG FIX: original was 'export jobid=SLURM_JOBID' (missing $), which stored
# the literal string "SLURM_JOBID" instead of the actual job id.
export jobid=$SLURM_JOBID

singularity exec --nv pytorch_21.03.sif python -m torch.distributed.launch ${DISTRIBUTED_ARGS} \
	${DIR}/Megatron-LM/pretrain_gpt.py \
	${options}

echo $MODEL_SIZE
echo $EXACT_MODEL_SIZE
echo "if you see this line, this means that you have successfully ran Megatron-LM, congratulations !"

+ 53 - 0
ai/Megatron/English/Python/source_code/create_dir_and_download_pytorch_sif_file.sh

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
# BUG FIX: the shebang must be the FIRST line of the file; it was previously
# placed after the copyright comment, so the kernel ignored it.

# Copy this script up to the repo-parent directory and create the layout
# expected by the Megatron run script.
cp create_dir_and_download_pytorch_sif_file.sh ../../../../../../
cd ../../../../../../ || exit 1   # abort rather than mkdir in the wrong place
mkdir -p output/sv_gpt3_ckpt/     # -p: parents created, safe to re-run
mkdir -p dataset
+
# gdrive_download pre-built **pytorch_21.03.sif** which is needed to run singularity
# script to download Google Drive files from command line
# not guaranteed to work indefinitely
# taken from Stack Overflow answer:
# http://stackoverflow.com/a/38937732/7002068

gURL=https://drive.google.com/file/d/18-QSZhPhNJS3m9ASTPkjnzsFVgg71MNx/view?usp=sharing
# The Drive file id is the run of 26+ word characters / dashes in the URL.
# (grep -E replaces the deprecated egrep.)
ggID=$(echo "$gURL" | grep -E -o '(\w|-){26,}')

ggURL='https://drive.google.com/uc?export=download'

# First request stores the session cookie; for large files Google answers
# with a confirm token instead of the file, captured into $getcode.
curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null
getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)"

echo -e "Downloading from ${gURL}...\n"
# BUG FIX: call curl directly instead of eval'ing a command string — the
# eval added nothing and is fragile if any variable ever contains metachars.
curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"
+
+# gdrive_download toy dataset 
+
+gURL=https://drive.google.com/file/d/17hIXwG6jHgijmBJKq2Z211Hm6AXfQo9C/view?usp=sharing
+# match more than 26 word characters
+ggID=$(echo "$gURL" | egrep -o '(\w|-){26,}')
+
+ggURL='https://drive.google.com/uc?export=download'
+
+curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null
+getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)"
+
+cmd='curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"'
+echo -e "Downloading from "$gURL"...\n"
+eval $cmd
+
+
### move toy data to correct dirs
mv toydata.zip ./dataset/
cd dataset/ || exit 1   # abort rather than unzip in the wrong directory
# BUG FIX: 'unzip -r' is not a valid unzip option ('-r' belongs to zip) and
# made extraction fail; '-o' overwrites existing files so re-runs succeed.
unzip -o toydata.zip
cd ..
ls ./dataset/

### move the Megatron run script to the correct directory
cp ./gpubootcamp/ai/Megatron/English/Python/source_code/Day1-runMegatron-LM_GPT_template.sh ./
echo "done !"