
changed dataset to ocrvqa

Kai Wu · 8 months ago
commit c18a0d277f

+ 5 - 4
recipes/quickstart/finetuning/datasets/vqa_dataset.py

@@ -48,18 +48,19 @@ def tokenize_dialogs(dialogs, images, processor):
                 labels[i] = -100
         label_list.append(labels)
     batch["labels"] = torch.tensor(label_list)
-    tokenizer_length = len(processor.tokenizer)
     return batch
 
 
 def get_custom_dataset(dataset_config, processor, split, split_ratio=0.9):
     # load_dataset will return DatasetDict that contains all the data in the train set
-    dataset_dict = load_dataset("HuggingFaceM4/the_cauldron", name="ai2d")
+    dataset_dict = load_dataset("HuggingFaceM4/the_cauldron", name="ocrvqa")
     dataset = dataset_dict['train']
+    # For quick testing, use only 2000 samples; comment out the following line to use the full dataset
+    dataset = dataset.select(range(2000))
     dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)[split]
     return dataset
 
-class VQADataCollator:
+class OCRVQADataCollator:
     def __init__(self, processor):
         self.processor = processor
         self.processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right
@@ -88,4 +89,4 @@ class VQADataCollator:
             images.append([image])
         return tokenize_dialogs(dialogs,images, self.processor)
 def get_data_collator(processor):
-    return VQADataCollator(processor)
+    return OCRVQADataCollator(processor)
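
A quick, self-contained sketch (not part of the diff) of how the updated loader behaves, assuming the `datasets` library is installed and the `ocrvqa` config of HuggingFaceM4/the_cauldron is accessible:

```python
# Sketch only: exercises the ocrvqa loading path introduced above.
from datasets import load_dataset

dataset_dict = load_dataset("HuggingFaceM4/the_cauldron", name="ocrvqa")
dataset = dataset_dict["train"]
dataset = dataset.select(range(2000))  # quick-test subset; comment out for the full dataset
splits = dataset.train_test_split(test_size=1 - 0.9, shuffle=True, seed=42)
train_set, eval_set = splits["train"], splits["test"]
print(len(train_set), len(eval_set))   # 1800 and 200 with the 2000-sample subset
```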

+ 12 - 16
recipes/quickstart/finetuning/finetune_vision_model.md

File diff suppressed because it is too large

+ 2 - 7
src/llama_recipes/finetuning.py

@@ -14,11 +14,6 @@ from torch.distributed.fsdp import (
     FullyShardedDataParallel as FSDP,
     ShardingStrategy
 )
-from torch.distributed.fsdp.wrap import (
-    always_wrap_policy,
-    ModuleWrapPolicy,
-    transformer_auto_wrap_policy,
-)
 from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
 from torch.optim.lr_scheduler import StepLR
 from transformers import (
@@ -26,7 +21,6 @@ from transformers import (
     AutoTokenizer,
     BitsAndBytesConfig,
     LlamaForCausalLM,
-    LlamaConfig,
     AutoProcessor, 
     MllamaForConditionalGeneration
 )
@@ -152,7 +146,8 @@ def main(**kwargs):
         raise ValueError(f"Model type {config.model_type} is not supported. Please use llama or mllama model.")
     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    tokenizer.pad_token_id = tokenizer.eos_token_id
+    if not tokenizer.pad_token_id:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
         
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
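
A minimal sketch of the new pad-token guard, assuming a standard Hugging Face tokenizer; the model id below is illustrative only:

```python
# Sketch only: keep an existing pad token and fall back to EOS only when none is set.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-11B-Vision")  # example id
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer.pad_token_id)
```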

+ 0 - 1
src/llama_recipes/utils/config_utils.py

@@ -14,7 +14,6 @@ from peft import (
 )
 from transformers import default_data_collator
 from transformers.data import DataCollatorForSeq2Seq
-from functools import partial
 
 from llama_recipes.configs import datasets, lora_config, llama_adapter_config, prefix_config, train_config
 from llama_recipes.data.sampler import LengthBasedBatchSampler, DistributedLengthBasedBatchSampler

+ 0 - 1
src/llama_recipes/utils/train_utils.py

@@ -360,7 +360,6 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer, wandb
             # Ensure no gradients are computed for this scope to save memory
             with torch.no_grad():
                 # Forward pass and compute loss
-                #outputs = model(**batch,use_cache=False)
                 outputs = model(**batch)
                 loss = outputs.loss
                 if train_config.save_metrics:
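
For context, a minimal sketch of the evaluation step this hunk touches, assuming `model` and `eval_dataloader` are set up as in the recipe and each batch carries labels so the forward pass returns a loss:

```python
# Sketch only: forward pass under torch.no_grad(), mirroring the hunk above.
import torch

model.eval()
eval_loss = 0.0
for batch in eval_dataloader:
    with torch.no_grad():               # no gradients needed during evaluation
        outputs = model(**batch)        # loss comes from the labels in the batch
        eval_loss += outputs.loss.detach().float()
```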