ft-config.yaml

# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama3.1 70B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir /tmp/Meta-Llama-3.1-70B-Instruct --ignore-patterns "original/consolidated*"
#
# To launch on 8 devices, run the following command from root:
#   tune run --nproc_per_node 8 full_finetune_distributed --config ft-config.yaml
output_dir: /tmp/torchtune/llama3_1_70B/full # /tmp may be deleted by your system. Change it to your preference.

seed: 69
shuffle: True

# Parallelism
tensor_parallel_dim: 1
tensor_parallel_plan:
  _component_: torchtune.models.llama3.base_llama_tp_plan
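# Note (assumption based on torchtune defaults): tensor_parallel_dim: 1 leaves tensor
# parallelism effectively disabled, so the 8 devices are used for FSDP sharding only.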

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /tmp/Meta-Llama-3.1-70B-Instruct/original/tokenizer.model
  max_seq_len: 16384

dataset:
  _component_: toolcall.custom_dataset
  #data_files: "train_data.json"
  #split: "train"
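# Note: toolcall.custom_dataset is assumed to be a local Python module on the path that
# exposes a torchtune-style dataset builder; the commented-out data_files/split keys
# above would be forwarded to that builder if uncommented.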

# Model Arguments
model:
  _component_: torchtune.models.llama3_1.llama3_1_70b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00030"
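    # The two fields above should expand to the sharded HF checkpoint names
    # model-00001-of-00030.safetensors ... model-00030-of-00030.safetensors.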
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
epochs: 30
save_every_epochs: 10
max_steps_per_epoch: null

lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 10

optimizer:
  _component_: torch.optim.AdamW
  lr: 2e-5
  # Note: highly recommended to use fused=True optimizer flag
  # with CPU offload for faster optimizer step.
  fused: False
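  # To follow that recommendation, set fused: True here and fsdp_cpu_offload: True below.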

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

gradient_accumulation_steps: 1 # Use to increase effective batch size
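# Effective batch size = batch_size * gradient_accumulation_steps * number of devices,
# i.e. 4 * 1 * 8 = 32 sequences per optimizer step with the settings above.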

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
#custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower memory, but lower speed.
fsdp_cpu_offload: False
clip_grad_norm: null
compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.WandBLogger
  project: ctt
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1