@@ -20,9 +20,9 @@ from transformers import (
     AutoConfig,
     AutoTokenizer,
     BitsAndBytesConfig,
-    LlamaForCausalLM,
     AutoProcessor,
-    MllamaForConditionalGeneration
+    MllamaForConditionalGeneration,
+    AutoModel,
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer
@@ -134,7 +134,7 @@ def main(**kwargs):
         processor.tokenizer.padding_side='right'
     elif config.model_type == "llama":
         is_vision = False
-        model = LlamaForCausalLM.from_pretrained(
+        model = AutoModel.from_pretrained(
             train_config.model_name,
             quantization_config=bnb_config,
             use_cache=use_cache,
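
For reference, a minimal sketch of the config-driven loading path after this change, run standalone outside the training script. The model name and quantization settings below are illustrative placeholders, not values taken from `finetuning.py` or `train_config`:

```python
import torch
from transformers import AutoConfig, AutoModel, BitsAndBytesConfig

# Illustrative stand-ins (assumptions, not values from finetuning.py).
model_name = "meta-llama/Llama-3.1-8B"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# The config identifies the architecture family, so the script can branch
# on model_type instead of hard-coding a concrete model class.
config = AutoConfig.from_pretrained(model_name)

if config.model_type == "llama":
    # AutoModel resolves to the backbone registered for this config
    # (LlamaModel for a Llama checkpoint); AutoModelForCausalLM would
    # resolve to LlamaForCausalLM, the class this diff stops importing.
    model = AutoModel.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        use_cache=False,  # stand-in for the script's use_cache variable
    )
```

Dispatching through the Auto classes keeps the text branch working for any checkpoint whose `model_type` is `llama` without importing the concrete Llama model class.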