finetuning.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import os

from pkg_resources import packaging

import fire
import random
import torch
import torch.optim as optim
from peft import get_peft_model, prepare_model_for_int8_training
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    ShardingStrategy,
)
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
from torch.optim.lr_scheduler import StepLR
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    LlamaConfig,
)
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

from llama_recipes.configs import fsdp_config as FSDP_CONFIG
from llama_recipes.configs import train_config as TRAIN_CONFIG
from llama_recipes.data.concatenator import ConcatDataset
from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing
from llama_recipes.utils import fsdp_auto_wrap_policy
from llama_recipes.utils.config_utils import (
    update_config,
    generate_peft_config,
    generate_dataset_config,
    get_dataloader_kwargs,
)
from llama_recipes.utils.dataset_utils import get_preprocessed_dataset
from llama_recipes.utils.fsdp_utils import hsdp_device_mesh
from llama_recipes.utils.train_utils import (
    train,
    freeze_transformer_layers,
    setup,
    setup_environ_flags,
    clear_gpu_cache,
    print_model_size,
    get_policies,
)
from accelerate.utils import is_xpu_available

def main(**kwargs):
    # Update the configuration for the training and sharding process
    train_config, fsdp_config = TRAIN_CONFIG(), FSDP_CONFIG()
    update_config((train_config, fsdp_config), **kwargs)

    # Set the seeds for reproducibility
    if is_xpu_available():
        torch.xpu.manual_seed(train_config.seed)
    torch.manual_seed(train_config.seed)
    random.seed(train_config.seed)

    if train_config.enable_fsdp:
        setup()
        # torchrun specific
        local_rank = int(os.environ["LOCAL_RANK"])
        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])

    if torch.distributed.is_initialized():
        if is_xpu_available():
            torch.xpu.set_device(local_rank)
        elif torch.cuda.is_available():
            torch.cuda.set_device(local_rank)
        clear_gpu_cache(local_rank)
        setup_environ_flags(rank)
    # Load the pre-trained model and set up its configuration
    use_cache = False if train_config.enable_fsdp else None
    if train_config.enable_fsdp and train_config.low_cpu_fsdp:
        """
        For FSDP, we can save CPU memory by loading the pretrained model on rank 0 only.
        This avoids CPU OOM when loading large models like Llama 70B, where the model
        alone would consume 2+ TB of CPU memory (70 * 4 * 8). It adds some communication
        overhead and currently requires a recent PyTorch nightly build.
        """
        v = packaging.version.parse(torch.__version__)
        verify_latest_nightly = v.is_devrelease and v.dev >= 20230701
        if not verify_latest_nightly:
            raise Exception("latest pytorch nightly build is required to run with low_cpu_fsdp config, "
                            "please install latest nightly.")
        if rank == 0:
            model = LlamaForCausalLM.from_pretrained(
                train_config.model_name,
                load_in_8bit=True if train_config.quantization else None,
                device_map="auto" if train_config.quantization else None,
                use_cache=use_cache,
                attn_implementation="sdpa" if train_config.use_fast_kernels else None,
            )
        else:
            llama_config = LlamaConfig.from_pretrained(train_config.model_name)
            llama_config.use_cache = use_cache
            with torch.device("meta"):
                model = LlamaForCausalLM(llama_config)
    else:
        model = LlamaForCausalLM.from_pretrained(
            train_config.model_name,
            load_in_8bit=True if train_config.quantization else None,
            device_map="auto" if train_config.quantization else None,
            use_cache=use_cache,
            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
        )
    # Load the tokenizer and add special tokens
    tokenizer = LlamaTokenizer.from_pretrained(train_config.model_name)
    tokenizer.pad_token_id = tokenizer.eos_token_id

    print_model_size(model, train_config, rank if train_config.enable_fsdp else 0)

    # Prepare the model for int8 training if quantization is enabled
    if train_config.quantization:
        model = prepare_model_for_int8_training(model)

    # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled
    if train_config.enable_fsdp and fsdp_config.pure_bf16:
        model.to(torch.bfloat16)

    if train_config.use_peft:
        peft_config = generate_peft_config(train_config, kwargs)
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
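        # With PEFT enabled, only the adapter weights (e.g., LoRA matrices) remain
        # trainable; print_trainable_parameters() reports how small that fraction is
        # relative to the full model.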
    # Build an HSDP device mesh when hybrid sharding is requested. Keep the mesh under
    # its own name so the imported hsdp_device_mesh() helper is not shadowed.
    hsdp_device_mesh_plan = None
    if fsdp_config.hsdp and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD:
        hsdp_device_mesh_plan = hsdp_device_mesh(
            replica_group_size=fsdp_config.replica_group_size,
            sharding_group_size=fsdp_config.sharding_group_size,
        )
        print("HSDP device mesh is ready")
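        # Under HYBRID_SHARD, parameters are fully sharded within each sharding group
        # (typically one node) and replicated across replica groups, trading extra
        # memory for reduced cross-node communication.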
    # Set up FSDP if enable_fsdp is enabled
    if train_config.enable_fsdp:
        if not train_config.use_peft and train_config.freeze_layers:
            freeze_transformer_layers(train_config.num_freeze_layers)

        mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
        my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, LlamaDecoderLayer)

        device_id = 0
        if is_xpu_available():
            device_id = torch.xpu.current_device()
        elif torch.cuda.is_available():
            device_id = torch.cuda.current_device()

        model = FSDP(
            model,
            auto_wrap_policy=my_auto_wrapping_policy if train_config.use_peft else wrapping_policy,
            cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None,
            mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
            sharding_strategy=fsdp_config.sharding_strategy,
            device_mesh=hsdp_device_mesh_plan,
            device_id=device_id,
            limit_all_gathers=True,
            sync_module_states=train_config.low_cpu_fsdp,
            param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
            if train_config.low_cpu_fsdp and rank != 0
            else None,
        )
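        # When low_cpu_fsdp is set, ranks other than 0 still hold a meta-device model at
        # this point: param_init_fn allocates empty GPU storage on those ranks, and
        # sync_module_states=True broadcasts rank 0's weights during wrapping so every
        # rank ends up with the same parameters.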
        if fsdp_config.fsdp_activation_checkpointing:
            apply_fsdp_checkpointing(model)
    elif not train_config.quantization and not train_config.enable_fsdp:
        if is_xpu_available():
            model.to("xpu:0")
        elif torch.cuda.is_available():
            model.to("cuda")

    dataset_config = generate_dataset_config(train_config, kwargs)

    # Load and preprocess the dataset for training and validation
    dataset_train = get_preprocessed_dataset(
        tokenizer,
        dataset_config,
        split="train",
    )
    if not train_config.enable_fsdp or rank == 0:
        print(f"--> Training Set Length = {len(dataset_train)}")

    dataset_val = get_preprocessed_dataset(
        tokenizer,
        dataset_config,
        split="test",
    )
    if not train_config.enable_fsdp or rank == 0:
        print(f"--> Validation Set Length = {len(dataset_val)}")

    if train_config.batching_strategy == "packing":
        dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)

    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, tokenizer, "train")

    # Create DataLoaders for the training and validation datasets
    train_dataloader = torch.utils.data.DataLoader(
        dataset_train,
        num_workers=train_config.num_workers_dataloader,
        pin_memory=True,
        **train_dl_kwargs,
    )

    eval_dataloader = None
    if train_config.run_validation:
        if train_config.batching_strategy == "packing":
            dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)

        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, tokenizer, "val")

        eval_dataloader = torch.utils.data.DataLoader(
            dataset_val,
            num_workers=train_config.num_workers_dataloader,
            pin_memory=True,
            **val_dl_kwargs,
        )
    # Initialize the optimizer and learning rate scheduler
    if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision":
        optimizer = AnyPrecisionAdamW(
            model.parameters(),
            lr=train_config.lr,
            momentum_dtype=torch.bfloat16,
            variance_dtype=torch.bfloat16,
            use_kahan_summation=False,
            weight_decay=train_config.weight_decay,
        )
    else:
        optimizer = optim.AdamW(
            model.parameters(),
            lr=train_config.lr,
            weight_decay=train_config.weight_decay,
        )
    scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
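    # AnyPrecisionAdamW keeps its momentum and variance buffers in bfloat16 (with
    # optional Kahan summation, disabled here), reducing optimizer-state memory; it is
    # only selected when running in pure bf16 mode.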
    # Start the training process
    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        scheduler,
        train_config.gradient_accumulation_steps,
        train_config,
        fsdp_config if train_config.enable_fsdp else None,
        local_rank if train_config.enable_fsdp else None,
        rank if train_config.enable_fsdp else None,
    )
    if not train_config.enable_fsdp or rank == 0:
        for k, v in results.items():
            print(f"Key: {k}, Value: {v}")


if __name__ == "__main__":
    fire.Fire(main)
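
# Example invocations (paths and flag values are illustrative; see the llama-recipes
# README for the exact, up-to-date commands):
#
#   Single GPU with PEFT (LoRA) and int8 quantization:
#     python finetuning.py --use_peft --peft_method lora --quantization \
#         --model_name /path/to/llama/model --output_dir /path/to/save/peft/model
#
#   Multi-GPU with FSDP via torchrun:
#     torchrun --nnodes 1 --nproc_per_node 4 finetuning.py --enable_fsdp \
#         --model_name /path/to/llama/model --use_peft --peft_method lora \
#         --output_dir /path/to/save/peft/model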