# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from warnings import warn

from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM

from llama_recipes.configs import quantization_config as QUANT_CONFIG
from llama_recipes.utils.config_utils import update_config


# Function to load the main model for text generation
def load_model(model_name, quantization, use_fast_kernels, **kwargs):
    if isinstance(quantization, bool):
        warn(
            "Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. "
            "Defaulting to '8bit' but this might change in the future.",
            FutureWarning,
        )
        quantization = "8bit"

    bnb_config = None
    if quantization:
        quant_config = QUANT_CONFIG()
        update_config(quant_config, **kwargs)
        bnb_config = quant_config.create_bnb_config(quantization)

    print(f"use_fast_kernels: {use_fast_kernels}")

    # Rebuild kwargs for from_pretrained; the incoming **kwargs were consumed
    # by update_config above.
    kwargs = {}
    if bnb_config:
        kwargs["quantization_config"] = bnb_config
    kwargs["device_map"] = "auto"
    kwargs["low_cpu_mem_usage"] = True
    kwargs["attn_implementation"] = "sdpa" if use_fast_kernels else None

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        return_dict=True,
        **kwargs,
    )
    return model


# Function to wrap the base model with a trained PEFT adapter
def load_peft_model(model, peft_model):
    peft_model = PeftModel.from_pretrained(model, peft_model)
    return peft_model


# Instantiate a Llama model from its config alone (random weights), so that
# FSDP checkpoints can subsequently be loaded into it
def load_llama_from_config(config_path):
    model_config = LlamaConfig.from_pretrained(config_path)
    model = LlamaForCausalLM(config=model_config)
    return model
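

# --- Minimal usage sketch (illustrative only, not part of the module). ---
# The model id and adapter path below are placeholder assumptions, not
# values prescribed by this repo.
if __name__ == "__main__":
    model = load_model(
        "meta-llama/Llama-2-7b-hf",  # assumed Hugging Face model id
        quantization="8bit",
        use_fast_kernels=True,
    )
    # Optionally attach a fine-tuned PEFT adapter (hypothetical local path).
    model = load_peft_model(model, "outputs/peft_checkpoint")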