#!/bin/bash
# finetune_race_distributed.sh
#
# Runs ./tasks/main.py with --task RACE through torch.distributed.launch,
# starting WORLD_SIZE worker processes on a single node (node rank 0,
# rendezvous on localhost:6000). Training starts from PRETRAINED_CHECKPOINT
# and checkpoints are written to CHECKPOINT_PATH.
#
# All paths below are relative, so invoke the script from the directory in
# which ./tasks/main.py, data/, checkpoints/ and the vocab file resolve.
#
# Argument lists are kept in Bash arrays and expanded quoted, so each element
# reaches Python as exactly one argument regardless of its contents.

set -euo pipefail

# Number of processes to start on this node.
WORLD_SIZE=8

# Options consumed by torch.distributed.launch itself.
DISTRIBUTED_ARGS=(
  --nproc_per_node "$WORLD_SIZE"
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
)

# --train-data / --valid-data each receive one argument per array element.
TRAIN_DATA=(
  data/RACE/train/middle
)
VALID_DATA=(
  data/RACE/dev/middle
  data/RACE/dev/high
)

VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

# Options passed through to ./tasks/main.py (same flags, order and values as
# the original single command line).
TASK_ARGS=(
  --task RACE
  --seed 1234
  --train-data "${TRAIN_DATA[@]}"
  --valid-data "${VALID_DATA[@]}"
  --tokenizer-type BertWordPieceLowerCase
  --vocab-file "$VOCAB_FILE"
  --epochs 3
  --pretrained-checkpoint "$PRETRAINED_CHECKPOINT"
  --tensor-model-parallel-size 1
  --num-layers 24
  --hidden-size 1024
  --num-attention-heads 16
  --micro-batch-size 4
  --checkpoint-activations
  --lr 1.0e-5
  --lr-decay-style linear
  --lr-warmup-fraction 0.06
  --seq-length 512
  --max-position-embeddings 512
  --save-interval 100000
  --save "$CHECKPOINT_PATH"
  --log-interval 10
  --eval-interval 100
  --eval-iters 50
  --weight-decay 1.0e-1
  --clip-grad 1.0
  --hidden-dropout 0.1
  --attention-dropout 0.1
  --fp16
)

# The launcher's exit status becomes the script's exit status.
python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" ./tasks/main.py \
  "${TASK_ARGS[@]}"