Browse Source

Merge pull request #13 from meta-llama/lmm_finetune

add vision model finetune recipe
Kai Wu, 6 months ago
commit e1bbffcbff

+ 3 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1451,3 +1451,6 @@ openhathi
 sarvam
 subtask
 acc
+OCRVQA
+OCRVQADataCollator
+ocrvqa

+ 90 - 0
recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py

@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+
+import copy
+from datasets import load_dataset
+import itertools
+import torch
+
+# check whether the system prompt or user prompt header token sequence appears in the current token list
+def check_header(targets, seq):
+    for i in range(len(seq) - 2):
+        if seq[i:i+3] in targets:
+            return True
+    return False
+def replace_target(target, seq):
+    for i in range(len(seq) - 2):
+        if seq[i:i+3] == target:
+            seq[i], seq[i+1], seq[i+2] = -100, -100, -100
+    return seq
+def tokenize_dialogs(dialogs, images, processor):
+    text_prompt = processor.apply_chat_template(dialogs)
+    batch = processor(images=images, text=text_prompt, padding=True, return_tensors="pt")
+    label_list = []
+    for i in range(len(batch["input_ids"])):
+        dialog_tokens = batch["input_ids"][i].tolist()
+        labels = copy.copy(dialog_tokens)
+        eot_indices = [i for i, n in enumerate(labels) if n == 128009]  # 128009 is the <|eot_id|> token
+        last_idx = 0
+        # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007]
+        # user prompt header "<|start_header_id|>user<|end_header_id|>" has been tokenized to [128006, 882, 128007]
+        prompt_header_seqs = [[128006, 9125, 128007],[128006, 882, 128007]]
+        for n, idx in enumerate(eot_indices):
+            current_seq = labels[last_idx:idx+1]
+            if check_header(prompt_header_seqs,current_seq):
+                # found prompt header, indicating that this seq should be masked
+                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+            else:
+                last_idx = idx+1
+        # Mask the assistant header "<|start_header_id|>assistant<|end_header_id|>", which has been tokenized to [128006, 78191, 128007]
+        assistant_header_seq = [128006, 78191, 128007]
+        labels = replace_target(assistant_header_seq,labels)
+        # Mask the padding token and image token 128256 
+        for i in range(len(labels)):
+            if labels[i] == processor.tokenizer.pad_token_id or labels[i] == 128256: #  128256 is image token index
+                labels[i] = -100
+        label_list.append(labels)
+    batch["labels"] = torch.tensor(label_list)
+    return batch
+
+
+def get_custom_dataset(dataset_config, processor, split, split_ratio=0.9):
+    # load_dataset returns a DatasetDict whose "train" split contains all the data
+    dataset_dict = load_dataset("HuggingFaceM4/the_cauldron", name="ocrvqa")
+    dataset = dataset_dict['train']
+    # for quick testing we only use 2000 samples; comment out the following line to use the full dataset
+    dataset = dataset.select(range(2000))
+    dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)[split]
+    return dataset
+
+class OCRVQADataCollator:
+    def __init__(self, processor):
+        self.processor = processor
+        self.processor.tokenizer.padding_side = "right"  # training always pads on the right
+    def __call__(self, samples):
+        dialogs,images = [],[]
+        for sample in samples:
+            image_list,sample_list = sample["images"],sample["texts"]
+            if len(image_list) > 1:
+                raise ValueError("Only one image per sample is supported")
+            image = image_list[0].convert("RGB")  # the sample's single image
+            dialog = []
+            for sample_dict in sample_list:
+                if not dialog:
+                    # only attach the image to the first user turn
+                    dialog += [
+                        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": sample_dict["user"].strip()}]},
+                        {"role": "assistant", "content": [{"type": "text", "text": sample_dict["assistant"].strip()}]},
+                    ]
+                else:
+                    dialog += [
+                        {"role": "user", "content": [{"type": "text", "text": sample_dict["user"].strip()}]},
+                        {"role": "assistant", "content": [{"type": "text", "text": sample_dict["assistant"].strip()}]},
+                    ]
+            dialogs.append(dialog)
+            images.append([image])
+        return tokenize_dialogs(dialogs, images, self.processor)
+
+def get_data_collator(processor):
+    return OCRVQADataCollator(processor)
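
For orientation, the recipe above can be smoke-tested on its own before it is wired into the finetuning script. A minimal sketch, assuming the file path below and a Llama 3.2 Vision checkpoint (neither is pinned by this commit):

    # Load the recipe file directly and build a single batch (illustrative only).
    import importlib.util
    from transformers import AutoProcessor

    spec = importlib.util.spec_from_file_location(
        "ocrvqa_dataset", "recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py"
    )
    ocrvqa = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(ocrvqa)

    processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
    train_set = ocrvqa.get_custom_dataset(None, processor, "train")  # dataset_config is unused here
    collator = ocrvqa.get_data_collator(processor)
    batch = collator([train_set[0], train_set[1]])
    print(batch["input_ids"].shape, batch["labels"].shape)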

File diff suppressed because it is too large
+ 33 - 0
recipes/quickstart/finetuning/finetune_vision_model.md


+ 5 - 3
src/llama_recipes/datasets/__init__.py

@@ -5,14 +5,16 @@ from functools import partial
 
 from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
 from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
-from llama_recipes.datasets.custom_dataset import get_custom_dataset
+from llama_recipes.datasets.custom_dataset import get_custom_dataset, get_data_collator
 from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
 from llama_recipes.datasets.toxicchat_dataset import get_llamaguard_toxicchat_dataset as get_llamaguard_toxicchat_dataset
-
 DATASET_PREPROC = {
     "alpaca_dataset": partial(get_alpaca_dataset),
     "grammar_dataset": get_grammar_dataset,
     "samsum_dataset": get_samsum_dataset,
     "custom_dataset": get_custom_dataset,
     "llamaguard_toxicchat_dataset": get_llamaguard_toxicchat_dataset,
-}
+}
+DATALOADER_COLLATE_FUNC = {
+    "custom_dataset": get_data_collator
+}

+ 20 - 0
src/llama_recipes/datasets/custom_dataset.py

@@ -35,3 +35,23 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
         print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).")
         raise e
 
+def get_data_collator(dataset_processer, dataset_config):
+    if ":" in dataset_config.file:
+        module_path, func_name = dataset_config.file.split(":")
+    else:
+        module_path, func_name = dataset_config.file, "get_data_collator"
+
+    if not module_path.endswith(".py"):
+        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+
+    module_path = Path(module_path)
+    if not module_path.is_file():
+        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+
+    module = load_module_from_py_file(module_path.as_posix())
+    try:
+        return getattr(module, func_name)(dataset_processer)
+    except AttributeError as e:
+        print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
+        print("Using the default data_collator instead.")
+        return None
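
The collator is resolved with the same file / file:func_name convention already used by get_custom_dataset, so a dataset config only has to point at the recipe file. A rough sketch of such a config follows; the dataclass is a simplified stand-in for llama_recipes.configs.datasets.custom_dataset, not its actual definition:

    from dataclasses import dataclass

    @dataclass
    class custom_dataset:
        dataset: str = "custom_dataset"
        # "path.py" looks up get_data_collator by name; "path.py:my_collator" selects a function explicitly
        file: str = "recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py"
        train_split: str = "train"
        test_split: str = "test"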

+ 61 - 22
src/llama_recipes/finetuning.py

@@ -14,16 +14,18 @@ from torch.distributed.fsdp import (
     FullyShardedDataParallel as FSDP,
     ShardingStrategy
 )
-
 from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
 from torch.optim.lr_scheduler import StepLR
 from transformers import (
+    AutoConfig,
     AutoTokenizer,
     BitsAndBytesConfig,
-    LlamaForCausalLM,
-    LlamaConfig,
+    AutoProcessor, 
+    MllamaForConditionalGeneration,
+    AutoModelForCausalLM,
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer, MllamaVisionEncoderLayer
 
 from llama_recipes.configs import fsdp_config as FSDP_CONFIG
 from llama_recipes.configs import train_config as TRAIN_CONFIG
@@ -39,7 +41,7 @@ from llama_recipes.utils.config_utils import (
     get_dataloader_kwargs,
     check_fsdp_config,
 )
-from llama_recipes.utils.dataset_utils import get_preprocessed_dataset
+from llama_recipes.utils.dataset_utils import get_preprocessed_dataset,get_custom_data_collator
 
 from llama_recipes.utils.fsdp_utils import hsdp_device_mesh
 from llama_recipes.utils.train_utils import (
@@ -118,19 +120,35 @@ def main(**kwargs):
 
     # Load the pre-trained model and setup its configuration
     use_cache = False if train_config.enable_fsdp else None
-    model = LlamaForCausalLM.from_pretrained(
+    config = AutoConfig.from_pretrained(train_config.model_name)
+    if config.model_type == "mllama":
+        is_vision = True
+        model = MllamaForConditionalGeneration.from_pretrained(
         train_config.model_name,
         quantization_config=bnb_config,
-        use_cache=use_cache,
         attn_implementation="sdpa" if train_config.use_fast_kernels else None,
         device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
         torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
     )
-
+        processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
+        processor.tokenizer.padding_side='right'
+    elif config.model_type == "llama":
+        is_vision = False
+        model = AutoModelForCausalLM.from_pretrained(
+            train_config.model_name,
+            quantization_config=bnb_config,
+            use_cache=use_cache,
+            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+        )
+    else:
+        raise ValueError(f"Model type {config.model_type} is not supported. Please use llama or mllama model.")
     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    tokenizer.pad_token_id = tokenizer.eos_token_id
-
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
@@ -169,8 +187,12 @@ def main(**kwargs):
             freeze_transformer_layers(model, train_config.num_freeze_layers)
 
         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
-        my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, LlamaDecoderLayer)
-
+        # Create the FSDP wrapper for MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer and MllamaVisionEncoderLayer in vision models
+        if is_vision:
+            my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer, MllamaVisionEncoderLayer])
+        else:
+            # Create the FSDP wrapper for LlamaDecoderLayer in text models
+            my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [LlamaDecoderLayer])
         device_id = 0
         if is_xpu_available():
             device_id = torch.xpu.current_device()
@@ -198,12 +220,16 @@ def main(**kwargs):
             model.to("xpu:0")
         elif torch.cuda.is_available():
             model.to("cuda")
-
     dataset_config = generate_dataset_config(train_config, kwargs)
+    if is_vision:
+        dataset_processer = processor
+    else:
+        dataset_processer = tokenizer
+
+    # Load and preprocess the dataset for training and validation
 
-     # Load and preprocess the dataset for training and validation
     dataset_train = get_preprocessed_dataset(
-        tokenizer,
+        dataset_processer,
         dataset_config,
         split="train",
     )
@@ -211,7 +237,7 @@ def main(**kwargs):
         print(f"--> Training Set Length = {len(dataset_train)}")
 
     dataset_val = get_preprocessed_dataset(
-        tokenizer,
+        dataset_processer,
         dataset_config,
         split="test",
     )
@@ -219,10 +245,17 @@ def main(**kwargs):
         print(f"--> Validation Set Length = {len(dataset_val)}")
 
     if train_config.batching_strategy == "packing":
-        dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
-
-    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, tokenizer, "train")
-
+        if is_vision:
+            raise ValueError("Packing is not supported for vision datasets")
+        else:
+            dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
+
+    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
+    print("length of dataset_train", len(dataset_train))
+    custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
+    if custom_data_collator:
+        print("custom_data_collator is used")
+        train_dl_kwargs["collate_fn"] = custom_data_collator
     # Create DataLoaders for the training and validation dataset
     train_dataloader = torch.utils.data.DataLoader(
         dataset_train,
@@ -230,13 +263,19 @@ def main(**kwargs):
         pin_memory=True,
         **train_dl_kwargs,
     )
+    print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
 
     eval_dataloader = None
     if train_config.run_validation:
         if train_config.batching_strategy == "packing":
-            dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
+            if is_vision:
+                raise ValueError("Packing is not supported for vision datasets")
+            else:
+                dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
 
-        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, tokenizer, "val")
+        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val")
+        if custom_data_collator:
+            val_dl_kwargs["collate_fn"] = custom_data_collator
 
         eval_dataloader = torch.utils.data.DataLoader(
             dataset_val,
@@ -244,6 +283,7 @@ def main(**kwargs):
             pin_memory=True,
             **val_dl_kwargs,
         )
+        print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
             raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
         else:
@@ -266,7 +306,6 @@ def main(**kwargs):
             weight_decay=train_config.weight_decay,
         )
     scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
-    # Start the training process
     results = train(
         model,
         train_dataloader,

+ 3 - 3
src/llama_recipes/policies/wrapping.py

@@ -4,6 +4,8 @@
 import functools
 
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer, MllamaVisionEncoderLayer
+
 from torch.distributed.fsdp.wrap import (
     transformer_auto_wrap_policy,
     size_based_auto_wrap_policy,
@@ -25,9 +27,7 @@ def get_llama_wrapper():
 
     llama_auto_wrap_policy = functools.partial(
         transformer_auto_wrap_policy,
-        transformer_layer_cls={
-            LlamaDecoderLayer,
-        },
+        transformer_layer_cls=set([LlamaDecoderLayer, MllamaSelfAttentionDecoderLayer, MllamaVisionEncoderLayer, MllamaCrossAttentionDecoderLayer])
     )
 
     return llama_auto_wrap_policy

+ 25 - 27
src/llama_recipes/utils/config_utils.py

@@ -17,8 +17,7 @@ from transformers.data import DataCollatorForSeq2Seq
 
 from llama_recipes.configs import datasets, lora_config, llama_adapter_config, prefix_config, train_config
 from llama_recipes.data.sampler import LengthBasedBatchSampler, DistributedLengthBasedBatchSampler
-from llama_recipes.utils.dataset_utils import DATASET_PREPROC
-
+from llama_recipes.datasets import DATASET_PREPROC
 
 def update_config(config, **kwargs):
     if isinstance(config, (tuple, list)):
@@ -76,37 +75,36 @@ def generate_dataset_config(train_config, kwargs):
     return  dataset_config
 
 
-def get_dataloader_kwargs(train_config, dataset, tokenizer, mode):
-        kwargs = {}
-        batch_size = train_config.batch_size_training if mode=="train" else train_config.val_batch_size
-        if train_config.batching_strategy == "padding":
-            if train_config.enable_fsdp:
-                kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
-                    dataset,
-                    batch_size=batch_size,
-                    rank=dist.get_rank(),
-                    num_replicas=dist.get_world_size(),
-                    shuffle=mode=="train",
-                )
-            else:
-                kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True, shuffle=mode=="train")
-            kwargs["collate_fn"] = DataCollatorForSeq2Seq(tokenizer)
-        elif train_config.batching_strategy == "packing":
-            if train_config.enable_fsdp:
-                kwargs["sampler"] = DistributedSampler(
+def get_dataloader_kwargs(train_config, dataset, dataset_processer, mode):
+    kwargs = {}
+    batch_size = train_config.batch_size_training if mode=="train" else train_config.val_batch_size
+    if train_config.batching_strategy == "padding":
+        if train_config.enable_fsdp:
+            kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
                 dataset,
+                batch_size=batch_size,
                 rank=dist.get_rank(),
                 num_replicas=dist.get_world_size(),
                 shuffle=mode=="train",
-                drop_last=True,
             )
-            kwargs["batch_size"] = batch_size
-            kwargs["drop_last"] = True
-            kwargs["collate_fn"] = default_data_collator
         else:
-            raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}")
-
-        return kwargs
+            kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True, shuffle=mode=="train")
+        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+    elif train_config.batching_strategy == "packing":
+        if train_config.enable_fsdp:
+            kwargs["sampler"] = DistributedSampler(
+            dataset,
+            rank=dist.get_rank(),
+            num_replicas=dist.get_world_size(),
+            shuffle=mode=="train",
+            drop_last=True,
+        )
+        kwargs["batch_size"] = batch_size
+        kwargs["drop_last"] = True
+        kwargs["collate_fn"] = default_data_collator
+    else:
+        raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}")
+    return kwargs
 
 
 def check_fsdp_config(fsdp_config):

+ 11 - 1
src/llama_recipes/utils/dataset_utils.py

@@ -4,7 +4,7 @@
 import torch
 
 from llama_recipes.data.concatenator import ConcatDataset
-from llama_recipes.datasets import DATASET_PREPROC, get_custom_dataset
+from llama_recipes.datasets import DATASET_PREPROC, DATALOADER_COLLATE_FUNC
 from llama_recipes.utils.config_utils import get_dataloader_kwargs
 
 
@@ -27,6 +27,16 @@ def get_preprocessed_dataset(
         get_split(),
     )
 
+def get_custom_data_collator(
+    dataset_processer, dataset_config
+):
+    if dataset_config.dataset not in DATALOADER_COLLATE_FUNC:
+        return None
+
+    return DATALOADER_COLLATE_FUNC[dataset_config.dataset](
+        dataset_processer,
+        dataset_config
+    )
 
 def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"):
     dataset = get_preprocessed_dataset(tokenizer, dataset_config, split)
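
Putting the pieces together: get_custom_data_collator only returns a collator for dataset names registered in DATALOADER_COLLATE_FUNC and yields None otherwise, in which case the default collator chosen by get_dataloader_kwargs is used. A minimal sketch of the lookup; the model id and config object are illustrative:

    from types import SimpleNamespace
    from transformers import AutoProcessor
    from llama_recipes.utils.dataset_utils import get_custom_data_collator

    processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
    cfg = SimpleNamespace(dataset="custom_dataset",
                          file="recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py")
    collator = get_custom_data_collator(processor, cfg)  # -> OCRVQADataCollator
    # a dataset name not in DATALOADER_COLLATE_FUNC (e.g. "samsum_dataset") would return None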

+ 2 - 4
src/llama_recipes/utils/fsdp_utils.py

@@ -3,7 +3,7 @@
 from torch.distributed._tensor.device_mesh import init_device_mesh
 import os 
 
-def fsdp_auto_wrap_policy(model, transformer_layer_name):
+def fsdp_auto_wrap_policy(model, transformer_layer_names):
     import functools
 
     from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
@@ -20,9 +20,7 @@ def fsdp_auto_wrap_policy(model, transformer_layer_name):
     lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
     transformer_wrap_policy = functools.partial(
         transformer_auto_wrap_policy,
-        transformer_layer_cls=(
-            transformer_layer_name,
-        ),
+        transformer_layer_cls=set(transformer_layer_names)
     )
 
     auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])

+ 3 - 2
src/llama_recipes/utils/train_utils.py

@@ -118,6 +118,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     max_steps_reached = False  # Flag to indicate max training steps reached
     # Start the training loop
     for epoch in range(train_config.num_epochs):
+        print(f"Starting epoch {epoch}/{train_config.num_epochs}")
+        print(f"train_config.max_train_step: {train_config.max_train_step}")
         # stop when the maximum number of training steps is reached
         if max_steps_reached:
             break
@@ -143,10 +145,9 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                             else:
                                 batch[key] = batch[key].to(local_rank)
                         else:
-
                             if is_xpu_available():
                                 batch[key] = batch[key].to('xpu:0')
-                            else:
+                            elif torch.cuda.is_available():
                                 batch[key] = batch[key].to('cuda:0')
                     with autocast():
                         loss = model(**batch).loss