#!/usr/bin/env bash
# profile_2nd_run.sh
# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
#
# Launches ./Megatron-LM/Dlprof_pretrain_gpt.py through torch.distributed.launch
# under `nsys profile` and writes the resulting report to PROFILE_OUTPUT_PATH.
#
# Usage: run from the directory in which ./Megatron-LM and the relative
# ../dataset, ../sv_ckpt and ../profiles paths below resolve.

# Abort on references to unset variables so a misspelled name in the
# modifiable sections fails loudly instead of expanding to nothing.
set -u

GPUS_PER_NODE=2 # <--- remember to change this to the number of GPUs you actually have in your system
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1 # <-- currently we are using 1 node with multiple GPUs
NODE_RANK=0
# Derived so it cannot drift from GPUS_PER_NODE / NNODES when those are edited.
# Not referenced by the launch command in this script; kept for reference.
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

### modify this section to point the file to its own path
CHECKPOINT_PATH='../sv_ckpt/' ## modify this path if you customize it
DATA_PATH='../dataset/EN/NVblog_text_document' ## modify this path if you customize it
VOCAB_FILE='../dataset/EN/50k/gpt2-vocab.json' ## modify this path if you customize it
MERGE_FILE='../dataset/EN/50k/gpt2-merges.txt' ## modify this path if you customize it
PROFILE_OUTPUT_PATH='../profiles/2ndrun/nsys_improved' # modify this to your own profile path

################ Beginning of modifiable section ####################
TENSOR_MP_SIZE=2
PIPELINE_MP_SIZE=1
NUM_LYS=32
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=32
SEQ_LEN=1024
MAX_POS_EM=1024
MICRO_BZ=16
GLOBAL_BZ=128
############## end of modifiable section, do NOT modify anything below this line ####################

export OMP_NUM_THREADS=1

# Launcher arguments are kept in an array so each option and value reaches
# torch.distributed.launch as its own word without relying on word-splitting.
DISTRIBUTED_ARGS=(
  --nproc_per_node "$GPUS_PER_NODE"
  --nnodes "$NNODES"
  --node_rank "$NODE_RANK"
  --master_addr "$MASTER_ADDR"
  --master_port "$MASTER_PORT"
)

## for nsys run
# The profiler wraps the whole distributed launch. --force-overwrite=true
# replaces an existing report at PROFILE_OUTPUT_PATH.
# NOTE(review): ${DATA_PATH} is intentionally left unquoted so that a
# space-separated list of datasets would still be passed as separate
# arguments -- confirm whether --data-path accepts multiple values here.
nsys profile --stats=false --force-overwrite=true --duration=300 --trace=cudnn,cuda,osrt,nvtx -o "$PROFILE_OUTPUT_PATH" \
  python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" \
  ./Megatron-LM/Dlprof_pretrain_gpt.py \
  --tensor-model-parallel-size "${TENSOR_MP_SIZE}" \
  --pipeline-model-parallel-size "${PIPELINE_MP_SIZE}" \
  --num-layers "${NUM_LYS}" \
  --hidden-size "${HIDDEN_SIZE}" \
  --num-attention-heads "${NUM_ATTN_HEADS}" \
  --micro-batch-size "${MICRO_BZ}" \
  --global-batch-size "${GLOBAL_BZ}" \
  --seq-length "${SEQ_LEN}" \
  --max-position-embeddings "${MAX_POS_EM}" \
  --train-samples 100 \
  --save "${CHECKPOINT_PATH}" \
  --load "${CHECKPOINT_PATH}" \
  --data-path ${DATA_PATH} \
  --vocab-file "${VOCAB_FILE}" \
  --merge-file "${MERGE_FILE}" \
  --data-impl mmap \
  --split 949,50,1 \
  --distributed-backend nccl \
  --lr 0.00015 \
  --lr-decay-style cosine \
  --min-lr 1.0e-5 \
  --weight-decay 1e-2 \
  --clip-grad 1.0 \
  --lr-warmup-fraction .01 \
  --checkpoint-activations \
  --log-interval 10 \
  --save-interval 100 \
  --eval-interval 200 \
  --eval-iters 10 \
  --fp16