# model_utils.py
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the GNU General Public License version 3.
  3. from llama_recipes.utils.config_utils import update_config
  4. from llama_recipes.configs import quantization_config as QUANT_CONFIG
  5. from peft import PeftModel
  6. from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
  7. from warnings import warn
  8. # Function to load the main model for text generation
  9. def load_model(model_name, quantization, use_fast_kernels, **kwargs):
  10. if type(quantization) == type(True):
  11. warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning)
  12. quantization = "8bit"
  13. bnb_config = None
  14. if quantization:
  15. quant_config = QUANT_CONFIG()
  16. update_config(quant_config, **kwargs)
  17. bnb_config = quant_config.create_bnb_config(quantization)
  18. print(f"use_fast_kernels{use_fast_kernels}")
  19. kwargs = {}
  20. if bnb_config:
  21. kwargs["quantization_config"]=bnb_config
  22. kwargs["device_map"]="auto"
  23. kwargs["low_cpu_mem_usage"]=True
  24. kwargs["attn_implementation"]="sdpa" if use_fast_kernels else None
  25. model = AutoModelForCausalLM.from_pretrained(
  26. model_name,
  27. return_dict=True,
  28. **kwargs,
  29. )
  30. return model
  31. # Function to load the PeftModel for performance optimization
  32. def load_peft_model(model, peft_model):
  33. peft_model = PeftModel.from_pretrained(model, peft_model)
  34. return peft_model
  35. # Loading the model from config to load FSDP checkpoints into that
  36. def load_llama_from_config(config_path):
  37. model_config = LlamaConfig.from_pretrained(config_path)
  38. model = LlamaForCausalLM(config=model_config)
  39. return model