
finetune not working with fsdp

Kai Wu 7 months ago
parent
commit
b566582a86

+ 89 - 0
recipes/quickstart/finetuning/datasets/vqa_dataset.py

@@ -0,0 +1,89 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+
+import copy
+from datasets import load_dataset
+import itertools
+# check whether the system prompt or user prompt header token sequence appears in the current token list
+def check_header(targets,seq):
+    for i in range(len(seq)-2):  # i is the start index of each 3-token window
+        if seq[i:i+3] in targets:
+            return True
+    return False
+def replace_target(target,seq):  # mask every occurrence of the 3-token target sequence with -100
+    for i in range(len(seq)-2):
+        if seq[i:i+3] == target:
+            seq[i],seq[i+1],seq[i+2] = -100,-100,-100
+    return seq
+def tokenize_dialog(dialog, images, processor):
+    # Render the dialog into a text prompt with the processor's chat template (Llama 3 style special tokens)
+    text_prompt = processor.apply_chat_template(dialog)
+    #print("text_prompt",text_prompt)
+    batch = processor(images=images, text=text_prompt)
+    dialog_tokens = batch["input_ids"].tolist()[0]
+    #print("dialog_tokens",dialog_tokens)
+    #print("dialog_tokens",dialog_tokens)
+    attention_mask = batch["attention_mask"].tolist()[0]
+    #print("attention_mask",attention_mask)
+    labels = copy.copy(dialog_tokens)
+    eot_indices = [i for i,n in enumerate(labels) if n == 128009]
+    last_idx = 0
+    # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007]
+    # user prompt header "<|start_header_id|>user<|end_header_id|>" has been tokenized to [128006, 882, 128007]
+    prompt_header_seqs = [[128006, 9125, 128007],[128006, 882, 128007]]
+    for n, idx in enumerate(eot_indices):
+        current_seq = labels[last_idx:idx+1]
+        if check_header(prompt_header_seqs,current_seq):
+            # found prompt header, indicating that this seq should be masked
+            labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+        else:
+            last_idx = idx
+    # Lastly, mask the assistant header prompt <|start_header_id|>assistant<|end_header_id|>, which has been tokenized to [128006, 78191, 128007]
+    assistant_header_seq = [128006, 78191, 128007]
+    labels = replace_target(assistant_header_seq,labels)
+    #print("labels",labels)
+
+
+    combined_tokens = {
+        # "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
+        # "labels": list(itertools.chain(*(t for t in labels_tokens))),
+        "input_ids": dialog_tokens,
+        "labels": labels,
+        "attention_mask": [1]*len(dialog_tokens),
+        "pixel_values": batch["pixel_values"].tolist()[0],
+        "image_sizes": batch["image_sizes"].tolist()[0]
+    }
+    # input_ids =  list(itertools.chain(*(t for t in dialog_tokens))),
+    # labels = list(itertools.chain(*(t for t in labels_tokens))),
+    # attention_mask = [1]*len(list(itertools.chain(*(t for t in dialog_tokens)))),
+    # pixel_values =  batch["pixel_values"],
+    # image_sizes = batch["image_sizes"]
+#    print("combined_tokens",combined_tokens[image_sizes])
+
+    return combined_tokens
+def image_tokenize(sample, processor):
+    processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right
+    images,sample_text = sample["images"],sample["messages"]
+    dialog = []
+    for line in sample_text:
+        content = []
+        messages = line["content"]
+        role = line["role"]
+        for message in messages:
+            if message["type"] == "image":
+                content.append({"type": "image"})
+            elif message["type"] == "text":
+                content.append({"type": "text", "text": message["text"].strip()})
+        dialog.append({"role": role,"content":content})
+    return tokenize_dialog(dialog,images, processor)
+
+
+def get_custom_dataset(dataset_config, processor, split, split_ratio=0.9):
+    # load_dataset returns a DatasetDict containing every available split of the dataset
+    dataset_dict = load_dataset("remyxai/vqasynth_spacellava")
+    dataset = dataset_dict[split]
+    dataset = dataset.select(range(100))
+    tokenized_datasets = dataset.map(lambda x: image_tokenize(x, processor))
+    tokenized_datasets = tokenized_datasets.remove_columns(dataset.column_names)
+    return tokenized_datasets
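
The label-masking scheme above can be checked in isolation: any segment ending in `<|eot_id|>` (token 128009) that contains a system or user header is fully masked with -100, and the assistant header tokens are masked afterwards, so the loss is computed only on assistant responses. Below is a minimal, self-contained sketch of that logic; the three-token header ids and the eot id are the real Llama 3 ones quoted in the comments above, while the remaining token ids are made up for illustration.

```python
import copy

# Real Llama 3 special-token ids quoted in the recipe above; the other ids are made up.
PROMPT_HEADER_SEQS = [[128006, 9125, 128007], [128006, 882, 128007]]  # system / user headers
ASSISTANT_HEADER_SEQ = [128006, 78191, 128007]
EOT_ID = 128009

def mask_labels(input_ids):
    labels = copy.copy(input_ids)
    eot_indices = [i for i, tok in enumerate(labels) if tok == EOT_ID]
    last_idx = 0
    for idx in eot_indices:
        segment = labels[last_idx:idx + 1]
        # Mask the whole segment if it contains a system or user header.
        if any(segment[i:i + 3] in PROMPT_HEADER_SEQS for i in range(len(segment) - 2)):
            labels[last_idx:idx + 1] = [-100] * (idx - last_idx + 1)
        else:
            last_idx = idx
    # Finally mask the assistant header tokens themselves.
    for i in range(len(labels) - 2):
        if labels[i:i + 3] == ASSISTANT_HEADER_SEQ:
            labels[i:i + 3] = [-100, -100, -100]
    return labels

# Toy dialog: a user turn (ids 11, 12) followed by an assistant turn (ids 21, 22).
tokens = [128006, 882, 128007, 11, 12, 128009,
          128006, 78191, 128007, 21, 22, 128009]
print(mask_labels(tokens))
# Only the assistant answer (and its closing <|eot_id|>) keeps real labels:
# [-100, -100, -100, -100, -100, -100, -100, -100, -100, 21, 22, 128009]
```

As in the recipe, `last_idx` only advances past assistant segments, so an already-masked user segment is re-scanned together with the following assistant turn; that is harmless because the masked span no longer contains a header.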

+ 1 - 1
src/llama_recipes/configs/datasets.py

@@ -37,4 +37,4 @@ class custom_dataset:
 class llamaguard_toxicchat_dataset:
     dataset: str = "llamaguard_toxicchat_dataset"
     train_split: str = "train"
-    test_split: str = "test"
+    test_split: str = "test"
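
The change in this file appears to be whitespace-only (the removed and added `test_split` lines are textually identical); the new VQA recipe itself is not registered here. For orientation, wiring it in through the existing `custom_dataset` entry (whose header is visible in the hunk context) would presumably look like the sketch below; the field names and the default file path are assumptions based on the surrounding dataclasses, not part of this commit.

```python
from dataclasses import dataclass

# Hypothetical sketch (not in this commit): pointing the existing custom_dataset
# config at the new VQA recipe file. Field names mirror the dataclasses visible
# in datasets.py; adjust to the actual definition of custom_dataset.
@dataclass
class custom_dataset:
    dataset: str = "custom_dataset"
    file: str = "recipes/quickstart/finetuning/datasets/vqa_dataset.py"
    train_split: str = "train"
    test_split: str = "test"
```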

+ 15 - 7
src/llama_recipes/finetuning.py

@@ -22,6 +22,11 @@ from transformers import (
     BitsAndBytesConfig,
     LlamaForCausalLM,
     LlamaConfig,
+    AutoConfig,
+    AutoModel,
+    LlavaNextForConditionalGeneration,
+    LlavaNextProcessor
+
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 
@@ -116,11 +121,11 @@ def main(**kwargs):
         bnb_config = quant_config.create_bnb_config(train_config.quantization)
 
     # Load the pre-trained model and setup its configuration
-    use_cache = False if train_config.enable_fsdp else None
-    model = LlamaForCausalLM.from_pretrained(
+    #use_cache = False if train_config.enable_fsdp else None
+    model = LlavaNextForConditionalGeneration.from_pretrained(
         train_config.model_name,
         quantization_config=bnb_config,
-        use_cache=use_cache,
+    #    use_cache=use_cache,
         attn_implementation="sdpa" if train_config.use_fast_kernels else None,
         device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
         torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
@@ -129,7 +134,8 @@ def main(**kwargs):
     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
     tokenizer.pad_token_id = tokenizer.eos_token_id
-
+    processor = LlavaNextProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
+    processor.tokenizer.padding_side='right'
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
@@ -200,7 +206,7 @@ def main(**kwargs):
 
 
      # Load and preprocess the dataset for training and validation
     dataset_train = get_preprocessed_dataset(
-        tokenizer,
+        processor,
         dataset_config,
         split="train",
     )
@@ -208,7 +214,7 @@ def main(**kwargs):
         print(f"--> Training Set Length = {len(dataset_train)}")
 
     dataset_val = get_preprocessed_dataset(
-        tokenizer,
+        processor,
         dataset_config,
         split="test",
     )
@@ -219,7 +225,7 @@ def main(**kwargs):
         dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
 
     train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, tokenizer, "train")
-
+    print("length of dataset_train", len(dataset_train))
     # Create DataLoaders for the training and validation dataset
     train_dataloader = torch.utils.data.DataLoader(
         dataset_train,
@@ -227,6 +233,7 @@ def main(**kwargs):
         pin_memory=True,
         **train_dl_kwargs,
     )
+    print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
 
     eval_dataloader = None
     if train_config.run_validation:
@@ -241,6 +248,7 @@ def main(**kwargs):
             pin_memory=True,
             **val_dl_kwargs,
         )
+        print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
             raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
         else:
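
The patch above swaps `LlamaForCausalLM` for `LlavaNextForConditionalGeneration`, comments out the `use_cache` override, builds a `LlavaNextProcessor` next to the tokenizer, and hands the processor (instead of the tokenizer) to `get_preprocessed_dataset`. Stripped of the recipe's config plumbing, the load step reduces to roughly the following; the checkpoint name is only an example, not taken from the commit.

```python
import torch
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

# Example checkpoint (assumption): any LLaVA-NeXT checkpoint on the Hub would do here.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"

model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,   # the recipe picks fp16 or bf16 from train_config
    attn_implementation="sdpa",   # mirrors use_fast_kernels
)

# The processor bundles the image processor with the tokenizer; right padding
# matches what both the patch and vqa_dataset.py set for training.
processor = LlavaNextProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "right"
```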

+ 0 - 1
src/llama_recipes/utils/config_utils.py

@@ -104,5 +104,4 @@ def get_dataloader_kwargs(train_config, dataset, tokenizer, mode):
             kwargs["collate_fn"] = default_data_collator
         else:
             raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}")
-
         return kwargs

+ 9 - 2
src/llama_recipes/utils/train_utils.py

@@ -118,6 +118,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     max_steps_reached = False  # Flag to indicate max training steps reached
     # Start the training loop
     for epoch in range(train_config.num_epochs):
+        print(f"Starting epoch {epoch}/{train_config.num_epochs}")
+        print(f"train_config.max_train_step: {train_config.max_train_step}")
         # stop when the maximum number of training steps is reached
         if max_steps_reached:
             break
@@ -130,6 +132,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
             with profile(train_config,local_rank) as profile_context:
                 for step, batch in enumerate(train_dataloader):
                     total_train_steps += 1
+                    #print("batch: ", batch)
                     # stop when the maximum number of training steps is reached
                     if train_config.max_train_step > 0 and total_train_steps > train_config.max_train_step:
                         max_steps_reached = True
@@ -149,8 +152,11 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                             else:
                                 batch[key] = batch[key].to('cuda:0')
                     with autocast():
+                        assert(next(model.parameters()).device == batch['input_ids'].device)
+                        #print("batch: ", batch)
                         loss = model(**batch).loss
                     loss = loss / gradient_accumulation_steps
+                    #print("loss",loss)
                     if train_config.save_metrics:
                         train_step_loss.append(loss.detach().float().item())
                         train_step_perplexity.append(float(torch.exp(loss.detach().float())))
@@ -171,6 +177,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                             pbar.update(1)
                     else:
                         # regular backpropagation when fp16 is not used
+                        #print("loss123",loss)
                         loss.backward()
                         if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                             if train_config.gradient_clipping and train_config.gradient_clipping_threshold > 0.0:
@@ -243,12 +250,12 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                         print(f"PEFT modules are saved in {train_config.output_dir} directory")
 
                 else:
-                    if not train_config.use_peft and fsdp_config.checkpoint_type == StateDictType.FULL_STATE_DICT:
+                    if not train_config.use_peft and fsdp_config and fsdp_config.checkpoint_type == StateDictType.FULL_STATE_DICT:
 
                         save_model_checkpoint(
                             model, optimizer, rank, train_config, epoch=epoch
                         )
-                    elif not train_config.use_peft and fsdp_config.checkpoint_type == StateDictType.SHARDED_STATE_DICT:
+                    elif not train_config.use_peft and fsdp_config and fsdp_config.checkpoint_type == StateDictType.SHARDED_STATE_DICT:
                         print(" Saving the FSDP model checkpoints using SHARDED_STATE_DICT")
                         print("=====================================================")
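
The last hunk guards the checkpointing branch: when FSDP is disabled, `fsdp_config` is `None`, so reading `fsdp_config.checkpoint_type` unguarded raises an `AttributeError` during checkpoint saving; the added `fsdp_config and ...` check short-circuits that. A minimal sketch of the pattern, with stand-in names rather than the real recipe objects:

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional

# Stand-ins for illustration only; the recipe uses torch.distributed.fsdp.StateDictType
# and its own fsdp_config dataclass.
class StateDictType(Enum):
    FULL_STATE_DICT = auto()
    SHARDED_STATE_DICT = auto()

@dataclass
class FSDPConfig:
    checkpoint_type: StateDictType = StateDictType.SHARDED_STATE_DICT

def checkpoint_mode(use_peft: bool, fsdp_config: Optional[FSDPConfig]) -> str:
    if use_peft:
        return "peft"
    # Guard first, exactly like the patched condition: only touch checkpoint_type
    # when fsdp_config is not None.
    if fsdp_config and fsdp_config.checkpoint_type == StateDictType.FULL_STATE_DICT:
        return "full_state_dict"
    if fsdp_config and fsdp_config.checkpoint_type == StateDictType.SHARDED_STATE_DICT:
        return "sharded_state_dict"
    return "no_fsdp_checkpoint"

print(checkpoint_mode(use_peft=False, fsdp_config=None))          # no_fsdp_checkpoint, no AttributeError
print(checkpoint_mode(use_peft=False, fsdp_config=FSDPConfig()))  # sharded_state_dict
```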