pretrain_gpt3_175B.sh

#!/bin/bash

#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
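# Pretrains a 175B-parameter GPT-3-style model with Megatron-LM on 128 nodes x 8 GPUs.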

DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
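
# Blended training data: each dataset path is preceded by its sampling weight
# (here 20% / 30% / 50%); the weighted list is passed to --data-path below.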
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
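
# Model parallelism: 8-way tensor x 16-way pipeline = 128-way model parallel, so the
# 1024 GPUs (128 nodes x 8) leave a data-parallel size of 8. The global batch ramps up
# from 16 in increments of 16 over the first 5,859,375 samples before reaching 1536.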
options=" \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 16 \
    --num-layers 96 \
    --hidden-size 12288 \
    --num-attention-heads 96 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 1 \
    --global-batch-size 1536 \
    --rampup-batch-size 16 16 5859375 \
    --train-samples 146484375 \
    --lr-decay-samples 126953125 \
    --lr-warmup-samples 183105 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --log-interval 10 \
    --eval-iters 40 \
    --eval-interval 1000 \
    --data-path ${DATASET} \
    --vocab-file <PATH TO gpt-vocab.json> \
    --merge-file <PATH TO gpt-merges.txt> \
    --save-interval 1000 \
    --save <PATH TO CHECKPOINTS DIRECTORY> \
    --load <PATH TO CHECKPOINTS DIRECTORY> \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.006 \
    --tensorboard-dir <TENSORBOARD DIRECTORY> \
    --fp16 \
    --checkpoint-activations "
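
# Build the training command; any extra arguments passed to this script ($@) are
# forwarded to pretrain_gpt.py alongside the options above.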
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
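
# Launch one task per GPU inside the NGC PyTorch container (the --container-* flags
# come from SLURM's pyxis plugin); per-job logs are written under ./logs.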
srun -l \
    --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
    --container-mounts "<DIRECTORIES TO MOUNT>" \
    --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"

set +x