@@ -26,11 +26,8 @@ from transformers import (
     BitsAndBytesConfig,
     LlamaForCausalLM,
     LlamaConfig,
-    AutoConfig,
-    AutoModel,
-    LlavaNextForConditionalGeneration,
-    LlavaNextProcessor
-
+    AutoProcessor,
+    MllamaForConditionalGeneration
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 from transformers.models.clip.modeling_clip import CLIPEncoder, CLIPEncoderLayer
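The hunk above swaps the LLaVA-NeXT imports for the Llama 3.2 vision classes, which only exist in recent `transformers` releases. A hedged, standalone sketch of a version guard (assuming Mllama support first shipped in `transformers` 4.45.0; not part of the patch):

```python
# Hedged sketch, not part of the patch: fail fast when the installed transformers
# build predates the Mllama classes (assumed to first ship in 4.45.0).
import transformers
from packaging import version  # packaging is already a transformers dependency

MIN_VERSION = "4.45.0"  # assumption: first release exposing MllamaForConditionalGeneration

if version.parse(transformers.__version__) < version.parse(MIN_VERSION):
    raise ImportError(
        f"transformers {transformers.__version__} is too old; "
        f"install >= {MIN_VERSION} to get MllamaForConditionalGeneration and AutoProcessor."
    )

from transformers import AutoProcessor, MllamaForConditionalGeneration  # noqa: F401
```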
@@ -126,7 +123,9 @@ def main(**kwargs):

     # Load the pre-trained model and setup its configuration
     use_cache = False if train_config.enable_fsdp else None
-    model = LlavaNextForConditionalGeneration.from_pretrained(
+    is_vision = "11B" in train_config.model_name or "90B" in train_config.model_name
+    if is_vision:
+        model = MllamaForConditionalGeneration.from_pretrained(
         train_config.model_name,
         quantization_config=bnb_config,
         #use_cache=use_cache,
@@ -134,12 +133,22 @@ def main(**kwargs):
         device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
         torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
     )
+        processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
+        processor.tokenizer.padding_side='right'
+    else:
+        model = LlamaForCausalLM.from_pretrained(
+            train_config.model_name,
+            quantization_config=bnb_config,
+            use_cache=use_cache,
+            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+        )

     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
     tokenizer.pad_token_id = tokenizer.eos_token_id
-    processor = LlavaNextProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    processor.tokenizer.padding_side='right'
+
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
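With the two hunks above, vision checkpoints (names containing "11B" or "90B") are loaded as `MllamaForConditionalGeneration` with an `AutoProcessor`, while text-only checkpoints keep the original `LlamaForCausalLM` path. A minimal smoke test of the vision branch, as a hedged sketch (the checkpoint id, dummy image, and `<|image|>` prompt marker are assumptions, and `device_map="auto"` requires `accelerate`):

```python
# Hedged sketch, not part of the patch: load the vision model/processor pair the
# way the 11B/90B branch does and run a single generate() call as a sanity check.
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision"  # assumption: the value of train_config.model_name

processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "right"  # right padding for training, as in the patch

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

image = Image.new("RGB", (224, 224), color="white")       # dummy image, shape check only
prompt = "<|image|><|begin_of_text|>Describe the image."  # <|image|> marks the image position

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=16)
print(processor.tokenizer.decode(out[0], skip_special_tokens=True))
```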
@@ -183,18 +192,16 @@ def main(**kwargs):
             device_id = torch.xpu.current_device()
         elif torch.cuda.is_available():
             device_id = torch.cuda.current_device()
-        # print(dir(model))
-        # for layer in model.named_children():
-        #     print(f"Layer: {layer}")
-
-        # layernorm = model.CLIPVisionTransformer.CLIPEncoder.LayerNorm
-        # for name, param in layernorm.named_parameters():
-        #     print(f"Parameter: {name}, Shape: {param.shape}, Dtype: {param.dtype}")
-        # exit()
+        if train_config.use_peft:
+            wrapping_policy = my_auto_wrapping_policy
+        else:
+            if is_vision:
+                wrapping_policy = ModuleWrapPolicy([CLIPEncoderLayer, LlamaDecoderLayer])
+            else:
+                wrapping_policy = ModuleWrapPolicy([LlamaDecoderLayer])
         model = FSDP(
             model,
-            auto_wrap_policy= ModuleWrapPolicy([CLIPEncoderLayer, LlamaDecoderLayer]),
-            #auto_wrap_policy= my_auto_wrapping_policy, #if train_config.use_peft else wrapping_policy,
+            auto_wrap_policy= wrapping_policy,
             cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None,
             mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
             sharding_strategy=fsdp_config.sharding_strategy,
@@ -205,10 +212,9 @@ def main(**kwargs):
             param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
             if train_config.low_cpu_fsdp and rank != 0 else None,
         )
-        #print(model)
         if fsdp_config.fsdp_activation_checkpointing:
             model.enable_input_require_grads()
-            model.gradient_checkpointing_enable()
+            #model.gradient_checkpointing_enable()
             apply_fsdp_checkpointing(model)
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
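The wrapping-policy hunk above picks FSDP units by module class: PEFT runs reuse `my_auto_wrapping_policy` (built earlier in this file), full fine-tuning wraps decoder layers, and vision runs additionally wrap vision-encoder layers. Note that `ModuleWrapPolicy` only wraps modules whose class appears in the given list, so the list has to match classes actually present in the loaded model. A hedged standalone sketch of the same selection (the helper name is hypothetical):

```python
# Hedged sketch of the selection logic above; pick_wrapping_policy is a
# hypothetical helper, not something the patch defines.
from torch.distributed.fsdp.wrap import ModuleWrapPolicy
from transformers.models.clip.modeling_clip import CLIPEncoderLayer
from transformers.models.llama.modeling_llama import LlamaDecoderLayer


def pick_wrapping_policy(use_peft: bool, is_vision: bool, peft_policy=None):
    """Mirror the patched branch: PEFT policy, or ModuleWrapPolicy over layer classes."""
    if use_peft:
        return peft_policy  # my_auto_wrapping_policy in finetuning.py
    layer_classes = [LlamaDecoderLayer]
    if is_vision:
        layer_classes.append(CLIPEncoderLayer)  # wrap vision-encoder blocks as their own FSDP units
    return ModuleWrapPolicy(layer_classes)


# Example: full fine-tuning of a vision checkpoint
policy = pick_wrapping_policy(use_peft=False, is_vision=True)
```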
@@ -217,15 +223,15 @@ def main(**kwargs):
             model.to("cuda")

     dataset_config = generate_dataset_config(train_config, kwargs)
+    if is_vision:
+        dataset_processer = processor
+    else:
+        dataset_processer = tokenizer
+
+    # Load and preprocess the dataset for training and validation

-    # Load and preprocess the dataset for training and validation
-    # dataset_train = get_preprocessed_dataset(
-    #     processor,
-    #     dataset_config,
-    #     split="train",
-    # )
     dataset_train = get_preprocessed_dataset(
-        processor,
+        dataset_processer,
         dataset_config,
         split="train",
     )
@@ -233,7 +239,7 @@ def main(**kwargs):
         print(f"--> Training Set Length = {len(dataset_train)}")

     dataset_val = get_preprocessed_dataset(
-        processor,
+        dataset_processer,
         dataset_config,
         split="test",
     )
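The dataset hunks hand the multimodal `processor` to `get_preprocessed_dataset` for vision runs and the plain `tokenizer` otherwise. The difference in what the two produce is the point of the switch; a hedged illustration (the checkpoint id and exact output keys are assumptions):

```python
# Hedged illustration, not part of the patch: the tokenizer emits text features
# only, while the Mllama processor also emits the image tensors the model needs.
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

model_id = "meta-llama/Llama-3.2-11B-Vision"  # assumption: the vision checkpoint in use

tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

text = "<|image|><|begin_of_text|>What is in this picture?"
image = Image.new("RGB", (224, 224), color="gray")

text_only = tokenizer(text, return_tensors="pt")
multimodal = processor(images=image, text=text, return_tensors="pt")

print(sorted(text_only.keys()))   # ['attention_mask', 'input_ids']
print(sorted(multimodal.keys()))  # adds pixel_values plus aspect-ratio / cross-attention masks
```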