
Save the `preprocessor_config.json` and `chat_template.json` for mllama model after conversion (#741)

Kai Wu, 6 months ago
parent
commit
d8b0eba79b
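For context, the heart of this change is in `checkpoint_converter_fsdp_hf.py` below: when the converted model is an mllama model, the script saves the full processor (which writes `preprocessor_config.json` and `chat_template.json`) instead of only the tokenizer. A minimal sketch of that logic (the helper name here is illustrative; the real code is inline in `main` in the diff further down):

```python
from transformers import AutoConfig, AutoTokenizer, MllamaProcessor

def save_tokenizer_or_processor(hf_model_name: str, out_dir: str) -> None:
    # Inspect the model config to decide what to save alongside the weights.
    config = AutoConfig.from_pretrained(hf_model_name)
    if config.model_type == "mllama":
        # Saving the processor also writes preprocessor_config.json and
        # chat_template.json next to the tokenizer files.
        MllamaProcessor.from_pretrained(hf_model_name).save_pretrained(out_dir)
    else:
        # Text-only llama models only need the tokenizer.
        AutoTokenizer.from_pretrained(hf_model_name).save_pretrained(out_dir)
```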

+ 1 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1484,3 +1484,4 @@ uv
 8xL40S
 xL
 EDA
+DeepLearningai

+ 2 - 2
recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md

@@ -2,10 +2,10 @@
 
 The folder here contains the Llama 3-ported notebooks of the DLAI short course [Building Agentic RAG with Llamaindex](https://www.deeplearning.ai/short-courses/building-agentic-rag-with-llamaindex/).
 
-1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder.
+1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder.
 
 2. [Building Agentic RAG with Llamaindex L2 Tool Calling](Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb) shows how to use Llama 3 to not only pick a function to execute, but also infer an argument to pass through the function.
 
 3. [Building Agentic RAG with Llamaindex L3 Building an Agent Reasoning Loop](Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb) shows how to define a complete agent reasoning loop to reason over tools and multiple steps on a complex question the user asks about a single document while maintaining memory.
 
-3. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity.
+4. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity.
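As a quick illustration of the router pattern the L1 notebook covers, here is a rough sketch against a recent `llama-index` API (the input file is a placeholder, class locations may differ between versions, and `Settings.llm`/`Settings.embed_model` are assumed to already be configured, e.g. pointing at a Llama 3 endpoint):

```python
from llama_index.core import SimpleDirectoryReader, SummaryIndex, VectorStoreIndex
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.tools import QueryEngineTool

# Build two query tools over a single document: one for Q&A, one for summarization.
docs = SimpleDirectoryReader(input_files=["paper.pdf"]).load_data()
vector_tool = QueryEngineTool.from_defaults(
    query_engine=VectorStoreIndex.from_documents(docs).as_query_engine(),
    description="Useful for answering specific questions about the document.",
)
summary_tool = QueryEngineTool.from_defaults(
    query_engine=SummaryIndex.from_documents(docs).as_query_engine(),
    description="Useful for summarizing the whole document.",
)

# The router's selector picks exactly one tool per query.
router = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[vector_tool, summary_tool],
)
print(router.query("What is the main contribution of this document?"))
```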

+ 2 - 2
recipes/experimental/long_context/H2O/README.md

@@ -8,7 +8,7 @@ Besides, LLMs usually have poor generation to long sequence during inference. H2
 
 The current implementation supports llama-1/2/3, from 7B to 70B. Since H2O only maintains the most important KV pairs, it might miss some important information in the middle of the content for some knowledge-intensive tasks.
 
-More details please refer to Paper: **https://arxiv.org/pdf/2306.14048**; Blog: **https://allenz.work/?p=11**.
+For more details, please refer to the paper: **https://arxiv.org/pdf/2306.14048**.
 
 **Note: this implementation is tested with transformers == 4.39.0**
 
@@ -21,7 +21,7 @@ python run_summarization.py \
 --input-path data/summarization/xsum.jsonl \
 --output-path summarization_output/xsum_h2o.jsonl \
 --model-name meta-llama/Meta-Llama-3-8B \
---enable_h2o_generation 
+--enable_h2o_generation
 ```
 
 ##### **Results**
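To make the H2O idea mentioned in this README concrete, here is a toy sketch of heavy-hitter KV-cache eviction, not the recipe's actual implementation: keep a small window of recent tokens plus the older positions that have accumulated the most attention, and drop the rest.

```python
import torch

def evict_kv_cache(keys, values, attn_scores, budget):
    """Toy single-head heavy-hitter eviction.

    keys, values: [seq_len, head_dim]; attn_scores: accumulated attention mass
    per cached position, shape [seq_len]; budget: max number of KV pairs to keep.
    """
    seq_len = keys.shape[0]
    if seq_len <= budget:
        return keys, values, attn_scores
    recent = budget // 2                 # always keep the most recent tokens
    heavy = budget - recent              # plus the heaviest hitters among older ones
    heavy_idx = torch.topk(attn_scores[: seq_len - recent], k=heavy).indices
    keep = torch.cat([heavy_idx.sort().values,
                      torch.arange(seq_len - recent, seq_len)])
    return keys[keep], values[keep], attn_scores[keep]
```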

+ 157 - 78
src/llama_recipes/finetuning.py

@@ -1,61 +1,68 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-from collections import Counter
+import dataclasses
 import os
+import random
+from collections import Counter
+from warnings import warn
 
-import dataclasses
 import fire
-import random
+import numpy as np
 import torch
 import torch.optim as optim
-import numpy as np
-from peft import get_peft_model, PeftModel
-from torch.distributed.fsdp import (
-    FullyShardedDataParallel as FSDP,
-    ShardingStrategy
-)
-from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
-from torch.optim.lr_scheduler import StepLR
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    AutoProcessor, 
-    LlamaForCausalLM,
-    MllamaForConditionalGeneration,
-)
-from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-from transformers.models.mllama.modeling_mllama import  MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer
+from accelerate.utils import is_xpu_available
 
-from llama_recipes.configs import fsdp_config as FSDP_CONFIG
-from llama_recipes.configs import train_config as TRAIN_CONFIG
-from llama_recipes.configs import quantization_config  as QUANTIZATION_CONFIG
+from llama_recipes.configs import (
+    fsdp_config as FSDP_CONFIG,
+    quantization_config as QUANTIZATION_CONFIG,
+    train_config as TRAIN_CONFIG,
+)
 from llama_recipes.data.concatenator import ConcatDataset
 from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing
 
 from llama_recipes.utils import fsdp_auto_wrap_policy
 from llama_recipes.utils.config_utils import (
-    update_config,
-    generate_peft_config,
+    check_fsdp_config,
     generate_dataset_config,
+    generate_peft_config,
     get_dataloader_kwargs,
-    check_fsdp_config,
+    update_config,
+)
+from llama_recipes.utils.dataset_utils import (
+    get_custom_data_collator,
+    get_preprocessed_dataset,
 )
-from llama_recipes.utils.dataset_utils import get_preprocessed_dataset,get_custom_data_collator
 
 from llama_recipes.utils.fsdp_utils import hsdp_device_mesh
 from llama_recipes.utils.train_utils import (
-    train,
+    clear_gpu_cache,
     freeze_transformer_layers,
+    get_policies,
+    print_model_size,
     setup,
     setup_environ_flags,
-    clear_gpu_cache,
-    print_model_size,
-    get_policies,
+    train,
+)
+from peft import get_peft_model, PeftModel
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
+from torch.optim.lr_scheduler import StepLR
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LlamaForCausalLM,
+    MllamaForConditionalGeneration,
+)
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mllama.modeling_mllama import (
+    MllamaCrossAttentionDecoderLayer,
+    MllamaSelfAttentionDecoderLayer,
+    MllamaVisionEncoderLayer,
 )
-from accelerate.utils import is_xpu_available
-from warnings import warn
+
 
 def setup_wandb(train_config, fsdp_config, **kwargs):
     try:
@@ -66,6 +73,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs):
             "Please install it using pip install wandb"
         )
     from llama_recipes.configs import wandb_config as WANDB_CONFIG
+
     wandb_config = WANDB_CONFIG()
     update_config(wandb_config, **kwargs)
     init_dict = dataclasses.asdict(wandb_config)
@@ -74,6 +82,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs):
     run.config.update(fsdp_config, allow_val_change=True)
     return run
 
+
 def main(**kwargs):
     # Update the configuration for the training and sharding process
     train_config, fsdp_config = TRAIN_CONFIG(), FSDP_CONFIG()
@@ -103,18 +112,23 @@ def main(**kwargs):
     wandb_run = None
 
     if train_config.use_wandb:
-        if not train_config.enable_fsdp or rank==0:
+        if not train_config.enable_fsdp or rank == 0:
             wandb_run = setup_wandb(train_config, fsdp_config, **kwargs)
-    
-    #setting quantization configs
+
+    # setting quantization configs
     bnb_config = None
     if train_config.quantization:
         if type(train_config.quantization) == type(True):
-            warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning)
+            warn(
+                "Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.",
+                FutureWarning,
+            )
             train_config.quantization = "8bit"
 
         if train_config.quantization == "8bit" and train_config.enable_fsdp:
-            raise ValueError("8bit quantization is not supported with FSDP, please use 4bit quantization")
+            raise ValueError(
+                "8bit quantization is not supported with FSDP, please use 4bit quantization"
+            )
 
         quant_config = QUANTIZATION_CONFIG()
         update_config(quant_config, **kwargs)
@@ -126,14 +140,22 @@ def main(**kwargs):
     if config.model_type == "mllama":
         is_vision = True
         model = MllamaForConditionalGeneration.from_pretrained(
-        train_config.model_name,
-        quantization_config=bnb_config,
-        attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-        device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
-        torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
-    )
-        processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-        processor.tokenizer.padding_side='right'
+            train_config.model_name,
+            quantization_config=bnb_config,
+            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+            device_map=(
+                "auto"
+                if train_config.quantization and not train_config.enable_fsdp
+                else None
+            ),
+            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+        )
+        processor = AutoProcessor.from_pretrained(
+            train_config.model_name
+            if train_config.tokenizer_name is None
+            else train_config.tokenizer_name
+        )
+        processor.tokenizer.padding_side = "right"
         model.supports_gradient_checkpointing = True
         model.language_model.supports_gradient_checkpointing = True
     elif config.model_type == "llama":
@@ -143,32 +165,50 @@ def main(**kwargs):
             quantization_config=bnb_config,
             use_cache=use_cache,
             attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+            device_map=(
+                "auto"
+                if train_config.quantization and not train_config.enable_fsdp
+                else None
+            ),
             torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
         )
     else:
-        raise ValueError(f"Model type {config.model_type} is not supported. Please use llama or mllama model.")
+        raise ValueError(
+            f"Model type {config.model_type} is not supported. Please use llama or mllama model."
+        )
     # Load the tokenizer and add special tokens
-    tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    if not tokenizer.pad_token_id: 
+    tokenizer = AutoTokenizer.from_pretrained(
+        train_config.model_name
+        if train_config.tokenizer_name is None
+        else train_config.tokenizer_name
+    )
+    if not tokenizer.pad_token_id:
         tokenizer.pad_token_id = tokenizer.eos_token_id
-        
+
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        print(
+            "WARNING: Resizing the embedding matrix to match the tokenizer vocab size."
+        )
         model.resize_token_embeddings(len(tokenizer))
 
     print_model_size(model, train_config, rank if train_config.enable_fsdp else 0)
 
     # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled
-    if train_config.enable_fsdp and fsdp_config.pure_bf16 and not train_config.quantization:
+    if (
+        train_config.enable_fsdp
+        and fsdp_config.pure_bf16
+        and not train_config.quantization
+    ):
         model.to(torch.bfloat16)
-        
+
     if train_config.use_peft:
         # Load the pre-trained peft model checkpoint and setup its configuration
         if train_config.from_peft_checkpoint:
-            model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
+            model = PeftModel.from_pretrained(
+                model, train_config.from_peft_checkpoint, is_trainable=True
+            )
             peft_config = model.peft_config
         # Generate the peft config and start fine-tuning from original model
         else:
@@ -179,23 +219,36 @@ def main(**kwargs):
         model.print_trainable_parameters()
 
     hsdp_device_mesh_plan = None
-    if fsdp_config.hsdp and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD:
-        hsdp_device_mesh_plan = hsdp_device_mesh(replica_group_size=fsdp_config.replica_group_size, sharding_group_size=fsdp_config.sharding_group_size)
+    if (
+        fsdp_config.hsdp
+        and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD
+    ):
+        hsdp_device_mesh_plan = hsdp_device_mesh(
+            replica_group_size=fsdp_config.replica_group_size,
+            sharding_group_size=fsdp_config.sharding_group_size,
+        )
         print("HSDP device mesh is ready")
 
-    #setting up FSDP if enable_fsdp is enabled
+    # setting up FSDP if enable_fsdp is enabled
     if train_config.enable_fsdp:
         check_fsdp_config(fsdp_config)
-        
+
         if not train_config.use_peft and train_config.freeze_layers:
             freeze_transformer_layers(model, train_config.num_freeze_layers)
 
         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
         # Create the FSDP wrapper for MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer and MllamaVisionEncoderLayer in vision models
         if is_vision:
-            my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [MllamaSelfAttentionDecoderLayer,MllamaSelfAttentionDecoderLayer,MllamaVisionEncoderLayer])
+            my_auto_wrapping_policy = fsdp_auto_wrap_policy(
+                model,
+                [
+                    MllamaSelfAttentionDecoderLayer,
+                    MllamaCrossAttentionDecoderLayer,
+                    MllamaVisionEncoderLayer,
+                ],
+            )
         else:
-        # Create the FSDP wrapper for LlamaDecoderLayer in text models
+            # Create the FSDP wrapper for LlamaDecoderLayer in text models
             my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [LlamaDecoderLayer])
         device_id = 0
         if is_xpu_available():
@@ -204,21 +257,36 @@ def main(**kwargs):
             device_id = torch.cuda.current_device()
         model = FSDP(
             model,
-            auto_wrap_policy= my_auto_wrapping_policy if train_config.use_peft else wrapping_policy,
-            cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None,
-            mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
+            auto_wrap_policy=(
+                my_auto_wrapping_policy if train_config.use_peft else wrapping_policy
+            ),
+            cpu_offload=(
+                CPUOffload(offload_params=True)
+                if fsdp_config.fsdp_cpu_offload
+                else None
+            ),
+            mixed_precision=(
+                mixed_precision_policy if not fsdp_config.pure_bf16 else None
+            ),
             sharding_strategy=fsdp_config.sharding_strategy,
             device_mesh=hsdp_device_mesh_plan,
             device_id=device_id,
             limit_all_gathers=True,
             sync_module_states=train_config.low_cpu_fsdp,
-            param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
-            if train_config.low_cpu_fsdp and rank != 0 else None,
+            param_init_fn=(
+                (
+                    lambda module: module.to_empty(
+                        device=torch.device("cuda"), recurse=False
+                    )
+                )
+                if train_config.low_cpu_fsdp and rank != 0
+                else None
+            ),
         )
-        if fsdp_config.fsdp_activation_checkpointing:            
+        if fsdp_config.fsdp_activation_checkpointing:
             model.enable_input_require_grads()
             model.gradient_checkpointing_enable()
-            apply_fsdp_checkpointing(model)                      
+            apply_fsdp_checkpointing(model)
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
             model.to("xpu:0")
@@ -252,11 +320,15 @@ def main(**kwargs):
         if is_vision:
             raise ValueError("Packing is not supported for vision datasets")
         else:
-            dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
+            dataset_train = ConcatDataset(
+                dataset_train, chunk_size=train_config.context_length
+            )
 
-    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
+    train_dl_kwargs = get_dataloader_kwargs(
+        train_config, dataset_train, dataset_processer, "train"
+    )
     print("length of dataset_train", len(dataset_train))
-    custom_data_collator = get_custom_data_collator(dataset_processer,dataset_config)
+    custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
     if custom_data_collator:
         print("custom_data_collator is used")
         train_dl_kwargs["collate_fn"] = custom_data_collator
@@ -275,9 +347,13 @@ def main(**kwargs):
             if is_vision:
                 raise ValueError("Packing is not supported for vision datasets")
             else:
-                dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
+                dataset_val = ConcatDataset(
+                    dataset_val, chunk_size=train_config.context_length
+                )
 
-        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val")
+        val_dl_kwargs = get_dataloader_kwargs(
+            train_config, dataset_val, dataset_processer, "val"
+        )
         if custom_data_collator:
             val_dl_kwargs["collate_fn"] = custom_data_collator
 
@@ -289,7 +365,9 @@ def main(**kwargs):
         )
         print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
-            raise ValueError(f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})")
+            raise ValueError(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            )
         else:
             print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
@@ -324,11 +402,12 @@ def main(**kwargs):
         rank if train_config.enable_fsdp else None,
         wandb_run,
     )
-    if not train_config.enable_fsdp or rank==0:
-        [print(f'Key: {k}, Value: {v}') for k, v in results.items()]
+    if not train_config.enable_fsdp or rank == 0:
+        [print(f"Key: {k}, Value: {v}") for k, v in results.items()]
         if train_config.use_wandb:
-            for k,v in results.items():
+            for k, v in results.items():
                 wandb_run.summary[k] = v
 
+
 if __name__ == "__main__":
     fire.Fire(main)
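Since `main` above is driven by `fire` and `update_config(**kwargs)`, the new vision branch can be exercised with keyword overrides; a hedged sketch (the model name and flag values are illustrative, and dataset/PEFT settings are left at their defaults):

```python
from llama_recipes.finetuning import main

# Illustrative overrides; each keyword maps onto a train_config field used above.
main(
    model_name="meta-llama/Llama-3.2-11B-Vision-Instruct",  # mllama -> is_vision branch
    use_peft=True,             # fine-tune a PEFT adapter rather than all weights
    use_fast_kernels=True,     # selects the SDPA attention implementation
    quantization="4bit",       # "8bit" is rejected when enable_fsdp is set
)
```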

+ 35 - 19
src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py

@@ -3,14 +3,15 @@
 
 # from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-import fire
 import os
 import sys
+
+import fire
 import yaml
 
-from transformers import AutoTokenizer
+from llama_recipes.inference.model_utils import load_llama_from_config
 
-from llama_recipes.inference.model_utils import  load_llama_from_config
+from transformers import AutoConfig, AutoTokenizer, MllamaProcessor
 
 # Get the current file's directory
 current_directory = os.path.dirname(os.path.abspath(__file__))
@@ -22,23 +23,24 @@ parent_directory = os.path.dirname(current_directory)
 sys.path.append(parent_directory)
 from model_checkpointing import load_sharded_model_single_gpu
 
+
 def main(
-    fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints
-    consolidated_model_path="", # Path to save the HF converted model checkpoints
-    HF_model_path_or_name="" # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
-    ):
-    
+    fsdp_checkpoint_path="",  # Path to FSDP Sharded model checkpoints
+    consolidated_model_path="",  # Path to save the HF converted model checkpoints
+    HF_model_path_or_name="",  # Path/name of the HF model that includes config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
+):
+
     try:
-        file_name = 'train_params.yaml'
+        file_name = "train_params.yaml"
         # Combine the directory and file name to create the full path
         train_params_path = os.path.join(fsdp_checkpoint_path, file_name)
         # Open the file
-        with open(train_params_path, 'r') as file:
+        with open(train_params_path, "r") as file:
             # Load the YAML data
             data = yaml.safe_load(file)
 
             # Access the 'model_name' field
-            HF_model_path_or_name = data.get('model_name')
+            HF_model_path_or_name = data.get("model_name")
 
             print(f"Model name: {HF_model_path_or_name}")
     except FileNotFoundError:
@@ -47,19 +49,33 @@ def main(
         print(f"Model name: {HF_model_path_or_name}")
     except Exception as e:
         print(f"An error occurred: {e}")
-        
-        
-    #load the HF model definition from config
+
+    # load the HF model definition from config
     model_def = load_llama_from_config(HF_model_path_or_name)
     print("model is loaded from config")
-    #load the FSDP sharded checkpoints into the model
+    # load the FSDP sharded checkpoints into the model
     model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path)
     print("model is loaded from FSDP checkpoints")
-    #loading the tokenizer form the  model_path
-    tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name)
-    tokenizer.save_pretrained(consolidated_model_path)
-    #save the FSDP sharded checkpoints in HF format
+    # load the model config from the model_path
+    config = AutoConfig.from_pretrained(HF_model_path_or_name)
+    # save the processor and config for mllama models
+    if config.model_type == "mllama":
+        processor = MllamaProcessor.from_pretrained(HF_model_path_or_name)
+        processor.save_pretrained(consolidated_model_path)
+        print(
+            f"HuggingFace mllama processor has been saved in {consolidated_model_path}"
+        )
+    else:
+        # save the tokenizer for llama models
+        tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name)
+        tokenizer.save_pretrained(consolidated_model_path)
+        print(
+            f"HuggingFace llama tokenizer has been saved in {consolidated_model_path}"
+        )
+    # save the FSDP sharded checkpoints in HF format
     model.save_pretrained(consolidated_model_path)
     print(f"HuggingFace model checkpoints have been saved in {consolidated_model_path}")
+
+
 if __name__ == "__main__":
     fire.Fire(main)
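Putting the converter change together, a hedged end-to-end sketch (paths are placeholders and the model name is only an example of an mllama checkpoint; the module can also be run directly as a script with the same flags): after conversion, the output directory for an mllama model should now contain `preprocessor_config.json` and `chat_template.json` alongside the model and tokenizer files.

```python
import os

from llama_recipes.inference.checkpoint_converter_fsdp_hf import main

OUT_DIR = "PATH/to/save/hf/checkpoints"  # placeholder output directory

main(
    fsdp_checkpoint_path="PATH/to/fsdp/sharded/checkpoints",  # placeholder
    consolidated_model_path=OUT_DIR,
    HF_model_path_or_name="meta-llama/Llama-3.2-11B-Vision-Instruct",
)

# For an mllama model, the processor files saved by this change should be present.
for fname in ("preprocessor_config.json", "chat_template.json"):
    print(fname, "exists:", os.path.exists(os.path.join(OUT_DIR, fname)))
```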