1 gadu atpakaļ · 526ee789cc
--- a/.github/scripts/spellcheck_conf/wordlist.txt
+++ b/.github/scripts/spellcheck_conf/wordlist.txt
@@ -1522,3 +1522,7 @@ globals
 
				 gmail
			
 
				 multiagent
			
 
				 yyy
			
 
				+jpeg
			
 
				+toend
			
 
				+codellama
			
 
				+DIFFLOG
			
--- a/getting-started/finetuning/datasets/ocrvqa_dataset.py
+++ b/getting-started/finetuning/datasets/ocrvqa_dataset.py
@@ -3,46 +3,60 @@
 
				 
			
 
				 
			
 
				 import copy
			
 
				-from datasets import load_dataset
			
 
				 import itertools
			
 
				+
			
 
				 import torch
			
 
				+from datasets import load_dataset
			
 
				+
			
 
				 
			
 
				 # check system prompt token seq or user prompt token seq is in the current token list
			
 
				-def check_header(targets,seq):
			
 
				-    for i in range(len(seq)-3):
			
 
				-        if seq[i:i+3] in targets:
			
 
				+def check_header(targets, seq):
			
 
				+    for i in range(len(seq) - 3):
			
 
				+        if seq[i : i + 3] in targets:
			
 
				             return True
			
 
				     return False
			
 
				-def replace_target(target,seq):
			
 
				-    for i in range(len(seq)-3):
			
 
				-        if seq[i:i+3] == target:
			
 
				-            seq[i],seq[i+1],seq[i+2] = -100,-100,-100
			
 
				+
			
 
				+
			
 
				+def replace_target(target, seq):
			
 
				+    for i in range(len(seq) - 3):
			
 
				+        if seq[i : i + 3] == target:
			
 
				+            seq[i], seq[i + 1], seq[i + 2] = -100, -100, -100
			
 
				     return seq
			
 
				+
			
 
				+
			
 
				 def tokenize_dialogs(dialogs, images, processor):
			
 
				     text_prompt = processor.apply_chat_template(dialogs)
			
 
				-    batch = processor(images=images, text=text_prompt,padding = True, return_tensors="pt")
			
 
				+    text_prompt = [prompt.replace('<|begin_of_text|>','') for prompt in text_prompt]
			
 
				+    batch = processor(
			
 
				+        images=images,
			
 
				+        text=text_prompt,
			
 
				+        padding=True,
			
 
				+        return_tensors="pt",
			
 
				+    )
			
 
				     label_list = []
			
 
				     for i in range(len(batch["input_ids"])):
			
 
				         dialog_tokens = batch["input_ids"][i].tolist()
			
 
				         labels = copy.copy(dialog_tokens)
			
 
				-        eot_indices = [i for i,n in enumerate(labels) if n == 128009]
			
 
				+        eot_indices = [i for i, n in enumerate(labels) if n == 128009]
			
 
				         last_idx = 0
			
 
				         # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007]
			
 
				         # user prompt header "<|start_header_id|>user<|end_header_id|>" has been tokenized to [128006, 882, 128007]
			
 
				-        prompt_header_seqs = [[128006, 9125, 128007],[128006, 882, 128007]]
			
 
				+        prompt_header_seqs = [[128006, 9125, 128007], [128006, 882, 128007]]
			
 
				         for n, idx in enumerate(eot_indices):
			
 
				-            current_seq = labels[last_idx:idx+1]
			
 
				-            if check_header(prompt_header_seqs,current_seq):
			
 
				+            current_seq = labels[last_idx : idx + 1]
			
 
				+            if check_header(prompt_header_seqs, current_seq):
			
 
				                 # found prompt header, indicating that this seq should be masked
			
 
				-                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
			
 
				+                labels[last_idx : idx + 1] = [-100] * (idx - last_idx + 1)
			
 
				             else:
			
 
				-                last_idx = idx+1
			
 
				+                last_idx = idx + 1
			
 
				             #  Mask all the assistant header prompt <|start_header_id|>assistant<|end_header_id|>, which has been tokenized to [128006, 78191, 128007]
			
 
				         assistant_header_seq = [128006, 78191, 128007]
			
 
				-        labels = replace_target(assistant_header_seq,labels)
			
 
				-        # Mask the padding token and image token 128256 
			
 
				+        labels = replace_target(assistant_header_seq, labels)
			
 
				+        # Mask the padding token and image token 128256
			
 
				         for i in range(len(labels)):
			
 
				-            if labels[i] == processor.tokenizer.pad_token_id or labels[i] == 128256: #  128256 is image token index
			
 
				+            if (
			
 
				+                labels[i] == processor.tokenizer.pad_token_id or labels[i] == 128256
			
 
				+            ):  #  128256 is image token index
			
 
				                 labels[i] = -100
			
 
				         label_list.append(labels)
			
 
				     batch["labels"] = torch.tensor(label_list)
			
@@ -52,39 +66,74 @@ def tokenize_dialogs(dialogs, images, processor):
 
				 def get_custom_dataset(dataset_config, processor, split, split_ratio=0.9):
			
 
				     # load_dataset will return DatasetDict that contains all the data in the train set
			
 
				     dataset_dict = load_dataset("HuggingFaceM4/the_cauldron", name="ocrvqa")
			
 
				-    dataset = dataset_dict['train']
			
 
				+    dataset = dataset_dict["train"]
			
 
				     # Comment out the following line to use the full dataset, for quick testing only use 2000 samples
			
 
				     dataset = dataset.select(range(2000))
			
 
				-    dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)[split]
			
 
				+    dataset = dataset.train_test_split(
			
 
				+        test_size=1 - split_ratio, shuffle=True, seed=42
			
 
				+    )[split]
			
 
				     return dataset
			
 
				 
			
 
				+
			
 
				 class OCRVQADataCollator:
			
 
				     def __init__(self, processor):
			
 
				         self.processor = processor
			
 
				-        self.processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right
			
 
				+        self.processor.tokenizer.padding_side = (
			
 
				+            "right"  # during training, one always uses padding on the right
			
 
				+        )
			
 
				+
			
 
				     def __call__(self, samples):
			
 
				-        dialogs,images = [],[]
			
 
				+        dialogs, images = [], []
			
 
				         for sample in samples:
			
 
				-            image_list,sample_list = sample["images"],sample["texts"]
			
 
				+            image_list, sample_list = sample["images"], sample["texts"]
			
 
				             if len(image_list) > 1:
			
 
				                 raise ValueError("Only support one image per sample")
			
 
				-            image = image_list[0].convert("RGB") # only use the first image
			
 
				+            image = image_list[0].convert("RGB")  # only use the first image
			
 
				             dialog = []
			
 
				             for sample_dict in sample_list:
			
 
				                 if not dialog:
			
 
				                     # only append image to the first sentence
			
 
				                     dialog += [
			
 
				-                    {"role":"user","content":[{"type": "image"},{"type": "text", "text": sample_dict["user"].strip()}]},
			
 
				-                    {"role":"assistant","content":[{"type": "text", "text": sample_dict["assistant"].strip()}]}
			
 
				-                ]
			
 
				-                
			
 
				+                        {
			
 
				+                            "role": "user",
			
 
				+                            "content": [
			
 
				+                                {"type": "image"},
			
 
				+                                {"type": "text", "text": sample_dict["user"].strip()},
			
 
				+                            ],
			
 
				+                        },
			
 
				+                        {
			
 
				+                            "role": "assistant",
			
 
				+                            "content": [
			
 
				+                                {
			
 
				+                                    "type": "text",
			
 
				+                                    "text": sample_dict["assistant"].strip(),
			
 
				+                                }
			
 
				+                            ],
			
 
				+                        },
			
 
				+                    ]
			
 
				+
			
 
				                 else:
			
 
				                     dialog += [
			
 
				-                    {"role":"user","content":[{"type": "text", "text": sample_dict["user"].strip()}]},
			
 
				-                    {"role":"assistant","content":[{"type": "text", "text": sample_dict["assistant"].strip()}]}
			
 
				-                ]
			
 
				+                        {
			
 
				+                            "role": "user",
			
 
				+                            "content": [
			
 
				+                                {"type": "text", "text": sample_dict["user"].strip()}
			
 
				+                            ],
			
 
				+                        },
			
 
				+                        {
			
 
				+                            "role": "assistant",
			
 
				+                            "content": [
			
 
				+                                {
			
 
				+                                    "type": "text",
			
 
				+                                    "text": sample_dict["assistant"].strip(),
			
 
				+                                }
			
 
				+                            ],
			
 
				+                        },
			
 
				+                    ]
			
 
				             dialogs.append(dialog)
			
 
				             images.append([image])
			
 
				-        return tokenize_dialogs(dialogs,images, self.processor)
			
 
				+        return tokenize_dialogs(dialogs, images, self.processor)
			
 
				+
			
 
				+
			
 
				 def get_data_collator(processor):
			
 
				     return OCRVQADataCollator(processor)
			
--- a/getting-started/inference/local_inference/multi_modal_infer.py
+++ b/getting-started/inference/local_inference/multi_modal_infer.py
@@ -1,13 +1,15 @@
 
				 import argparse
			
 
				 import os
			
 
				 import sys
			
 
				+
			
 
				+import gradio as gr
			
 
				 import torch
			
 
				 from accelerate import Accelerator
			
 
				+from huggingface_hub import HfFolder
			
 
				+from peft import PeftModel
			
 
				 from PIL import Image as PIL_Image
			
 
				 from transformers import MllamaForConditionalGeneration, MllamaProcessor
			
 
				-from peft import PeftModel
			
 
				-import gradio as gr
			
 
				-from huggingface_hub import HfFolder
			
 
				+
			
 
				 # Initialize accelerator
			
 
				 accelerator = Accelerator()
			
 
				 device = accelerator.device
			
@@ -43,24 +45,24 @@ def load_model_and_processor(model_name: str, finetuning_path: str = None):
 
				         torch_dtype=torch.bfloat16,
			
 
				         use_safetensors=True,
			
 
				         device_map=device,
			
 
				-        token=hf_token
			
 
				+        token=hf_token,
			
 
				+    )
			
 
				+    processor = MllamaProcessor.from_pretrained(
			
 
				+        model_name, token=hf_token, use_safetensors=True
			
 
				     )
			
 
				-    processor = MllamaProcessor.from_pretrained(model_name, token=hf_token, use_safetensors=True)
			
 
				 
			
 
				     if finetuning_path and os.path.exists(finetuning_path):
			
 
				         print(f"Loading LoRA adapter from '{finetuning_path}'...")
			
 
				         model = PeftModel.from_pretrained(
			
 
				-            model,
			
 
				-            finetuning_path,
			
 
				-            is_adapter=True,
			
 
				-            torch_dtype=torch.bfloat16
			
 
				+            model, finetuning_path, is_adapter=True, torch_dtype=torch.bfloat16
			
 
				         )
			
 
				         print("LoRA adapter merged successfully")
			
 
				-    
			
 
				+
			
 
				     model, processor = accelerator.prepare(model, processor)
			
 
				     return model, processor
			
 
				 
			
 
				-def process_image(image_path: str = None, image = None) -> PIL_Image.Image:
			
 
				+
			
 
				+def process_image(image_path: str = None, image=None) -> PIL_Image.Image:
			
 
				     """Process and validate image input"""
			
 
				     if image is not None:
			
 
				         return image.convert("RGB")
			
@@ -68,29 +70,44 @@ def process_image(image_path: str = None, image = None) -> PIL_Image.Image:
 
				         return PIL_Image.open(image_path).convert("RGB")
			
 
				     raise ValueError("No valid image provided")
			
 
				 
			
 
				-def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float):
			
 
				+
			
 
				+def generate_text_from_image(
			
 
				+    model, processor, image, prompt_text: str, temperature: float, top_p: float
			
 
				+):
			
 
				     """Generate text from image using model"""
			
 
				     conversation = [
			
 
				-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
			
 
				+        {
			
 
				+            "role": "user",
			
 
				+            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
			
 
				+        }
			
 
				     ]
			
 
				-    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
			
 
				-    inputs = processor(image, prompt, return_tensors="pt").to(device)
			
 
				-    output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS)
			
 
				-    return processor.decode(output[0])[len(prompt):]
			
 
				+    prompt = processor.apply_chat_template(
			
 
				+        conversation, add_generation_prompt=True, tokenize=False
			
 
				+    )
			
 
				+    inputs = processor(
			
 
				+        image, prompt, text_kwargs={"add_special_tokens": False}, return_tensors="pt"
			
 
				+    ).to(device)
			
 
				+    print("Input Prompt:\n", processor.tokenizer.decode(inputs.input_ids[0]))
			
 
				+    output = model.generate(
			
 
				+        **inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS
			
 
				+    )
			
 
				+    return processor.decode(output[0])[len(prompt) :]
			
 
				+
			
 
				 
			
 
				 def gradio_interface(model_name: str):
			
 
				     """Create Gradio UI with LoRA support"""
			
 
				     # Initialize model state
			
 
				     current_model = {"model": None, "processor": None}
			
 
				-    
			
 
				+
			
 
				     def load_or_reload_model(enable_lora: bool, lora_path: str = None):
			
 
				         current_model["model"], current_model["processor"] = load_model_and_processor(
			
 
				-            model_name, 
			
 
				-            lora_path if enable_lora else None
			
 
				+            model_name, lora_path if enable_lora else None
			
 
				         )
			
 
				         return "Model loaded successfully" + (" with LoRA" if enable_lora else "")
			
 
				 
			
 
				-    def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history):
			
 
				+    def describe_image(
			
 
				+        image, user_prompt, temperature, top_k, top_p, max_tokens, history
			
 
				+    ):
			
 
				         if image is not None:
			
 
				             try:
			
 
				                 processed_image = process_image(image=image)
			
@@ -100,7 +117,7 @@ def gradio_interface(model_name: str):
 
				                     processed_image,
			
 
				                     user_prompt,
			
 
				                     temperature,
			
 
				-                    top_p
			
 
				+                    top_p,
			
 
				                 )
			
 
				                 history.append((user_prompt, result))
			
 
				             except Exception as e:
			
@@ -112,7 +129,7 @@ def gradio_interface(model_name: str):
 
				 
			
 
				     with gr.Blocks() as demo:
			
 
				         gr.HTML("<h1 style='text-align: center'>Llama Vision Model Interface</h1>")
			
 
				-        
			
 
				+
			
 
				         with gr.Row():
			
 
				             with gr.Column(scale=1):
			
 
				                 # Model loading controls
			
@@ -121,58 +138,74 @@ def gradio_interface(model_name: str):
 
				                     lora_path = gr.Textbox(
			
 
				                         label="LoRA Weights Path",
			
 
				                         placeholder="Path to LoRA weights folder",
			
 
				-                        visible=False
			
 
				+                        visible=False,
			
 
				                     )
			
 
				                     load_status = gr.Textbox(label="Load Status", interactive=False)
			
 
				                     load_button = gr.Button("Load/Reload Model")
			
 
				 
			
 
				                 # Image and parameter controls
			
 
				-                image_input = gr.Image(label="Image", type="pil", image_mode="RGB", height=512, width=512)
			
 
				-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1)
			
 
				-                top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1)
			
 
				-                top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1)
			
 
				-                max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50)
			
 
				+                image_input = gr.Image(
			
 
				+                    label="Image", type="pil", image_mode="RGB", height=512, width=512
			
 
				+                )
			
 
				+                temperature = gr.Slider(
			
 
				+                    label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1
			
 
				+                )
			
 
				+                top_k = gr.Slider(
			
 
				+                    label="Top-k", minimum=1, maximum=100, value=50, step=1
			
 
				+                )
			
 
				+                top_p = gr.Slider(
			
 
				+                    label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1
			
 
				+                )
			
 
				+                max_tokens = gr.Slider(
			
 
				+                    label="Max Tokens",
			
 
				+                    minimum=50,
			
 
				+                    maximum=MAX_OUTPUT_TOKENS,
			
 
				+                    value=100,
			
 
				+                    step=50,
			
 
				+                )
			
 
				 
			
 
				             with gr.Column(scale=2):
			
 
				                 chat_history = gr.Chatbot(label="Chat", height=512)
			
 
				                 user_prompt = gr.Textbox(
			
 
				-                    show_label=False,
			
 
				-                    placeholder="Enter your prompt",
			
 
				-                    lines=2
			
 
				+                    show_label=False, placeholder="Enter your prompt", lines=2
			
 
				                 )
			
 
				-                
			
 
				+
			
 
				                 with gr.Row():
			
 
				                     generate_button = gr.Button("Generate")
			
 
				                     clear_button = gr.Button("Clear")
			
 
				 
			
 
				         # Event handlers
			
 
				         enable_lora.change(
			
 
				-            fn=lambda x: gr.update(visible=x),
			
 
				-            inputs=[enable_lora],
			
 
				-            outputs=[lora_path]
			
 
				+            fn=lambda x: gr.update(visible=x), inputs=[enable_lora], outputs=[lora_path]
			
 
				         )
			
 
				-        
			
 
				+
			
 
				         load_button.click(
			
 
				             fn=load_or_reload_model,
			
 
				             inputs=[enable_lora, lora_path],
			
 
				-            outputs=[load_status]
			
 
				+            outputs=[load_status],
			
 
				         )
			
 
				 
			
 
				         generate_button.click(
			
 
				             fn=describe_image,
			
 
				             inputs=[
			
 
				-                image_input, user_prompt, temperature,
			
 
				-                top_k, top_p, max_tokens, chat_history
			
 
				+                image_input,
			
 
				+                user_prompt,
			
 
				+                temperature,
			
 
				+                top_k,
			
 
				+                top_p,
			
 
				+                max_tokens,
			
 
				+                chat_history,
			
 
				             ],
			
 
				-            outputs=[chat_history]
			
 
				+            outputs=[chat_history],
			
 
				         )
			
 
				-        
			
 
				+
			
 
				         clear_button.click(fn=clear_chat, outputs=[chat_history])
			
 
				 
			
 
				     # Initial model load
			
 
				     load_or_reload_model(False)
			
 
				     return demo
			
 
				 
			
 
				+
			
 
				 def main(args):
			
 
				     """Main execution flow"""
			
 
				     if args.gradio_ui:
			
@@ -180,27 +213,30 @@ def main(args):
 
				         demo.launch()
			
 
				     else:
			
 
				         model, processor = load_model_and_processor(
			
 
				-            args.model_name,
			
 
				-            args.finetuning_path
			
 
				+            args.model_name, args.finetuning_path
			
 
				         )
			
 
				         image = process_image(image_path=args.image_path)
			
 
				         result = generate_text_from_image(
			
 
				-            model, processor, image,
			
 
				-            args.prompt_text,
			
 
				-            args.temperature,
			
 
				-            args.top_p
			
 
				+            model, processor, image, args.prompt_text, args.temperature, args.top_p
			
 
				         )
			
 
				         print("Generated Text:", result)
			
 
				 
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				-    parser = argparse.ArgumentParser(description="Multi-modal inference with optional Gradio UI and LoRA support")
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description="Multi-modal inference with optional Gradio UI and LoRA support"
			
 
				+    )
			
 
				     parser.add_argument("--image_path", type=str, help="Path to the input image")
			
 
				     parser.add_argument("--prompt_text", type=str, help="Prompt text for the image")
			
 
				-    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
			
 
				+    parser.add_argument(
			
 
				+        "--temperature", type=float, default=0.7, help="Sampling temperature"
			
 
				+    )
			
 
				     parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling")
			
 
				-    parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Model name")
			
 
				+    parser.add_argument(
			
 
				+        "--model_name", type=str, default=DEFAULT_MODEL, help="Model name"
			
 
				+    )
			
 
				     parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights")
			
 
				     parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI")
			
 
				-    
			
 
				+
			
 
				     args = parser.parse_args()
			
 
				-    main(args)
			
 
				+    main(args)