@@ -0,0 +1,793 @@
+#!/usr/bin/env python3
+"""
+Script to evaluate a vision-language model on the W-2 tax form dataset via an OpenAI-compatible API client.
+Uses the OpenAI SDK against OpenAI-compatible endpoints such as a vLLM server, the Llama API, or any other compatible API.
+Supports batch processing.
+Loads images from the provided dataset, sends them to the compatible API server,
+and compares the responses with the expected output.
+"""
+
+import argparse
+import base64
+import json
+import logging
+import os
+import re
+import shutil
+import sys
+import time
+import traceback
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset, load_from_disk
+from openai import OpenAI
+from PIL import Image
+from pydantic import BaseModel
+from tqdm import tqdm
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class W2Form(BaseModel):
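+    # Field names are kept verbatim, including nonstandard spellings such as
+    # "statutary" and "third_part", on the assumption that they must match the
+    # dataset's ground-truth keys exactly; renaming them would break scoring.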
+    box_b_employer_identification_number: str
+    box_c_employer_name: str
+    box_c_employer_street_address: str
+    box_c_employer_city_state_zip: str
+    box_a_employee_ssn: str
+    box_e_employee_name: str
+    box_e_employee_street_address: str
+    box_e_employee_city_state_zip: str
+    box_d_control_number: int
+    box_1_wages: float
+    box_2_federal_tax_withheld: float
+    box_3_social_security_wages: float
+    box_4_social_security_tax_withheld: float
+    box_5_medicare_wages: float
+    box_6_medicare_wages_tax_withheld: float
+    box_7_social_security_tips: float
+    box_8_allocated_tips: float
+    box_9_advance_eic_payment: Optional[str]
+    box_10_dependent_care_benefits: float
+    box_11_nonqualified_plans: float
+    box_12a_code: str
+    box_12a_value: float
+    box_12b_code: str
+    box_12b_value: float
+    box_12c_code: str
+    box_12c_value: float
+    box_12d_code: Optional[str]
+    box_12d_value: float
+    box_13_statutary_employee: Optional[str]
+    box_13_retirement_plan: Optional[str]
+    box_13_third_part_sick_pay: Optional[str]
+    box_15_1_state: str
+    box_15_1_employee_state_id: str
+    box_16_1_state_wages: float
+    box_17_1_state_income_tax: float
+    box_18_1_local_wages: float
+    box_19_1_local_income_tax: float
+    box_20_1_locality: str
+    box_15_2_state: str
+    box_15_2_employee_state_id: str
+    box_16_2_state_wages: float
+    box_17_2_state_income_tax: float
+    box_18_2_local_wages: float
+    box_19_2_local_income_tax: float
+    box_20_2_locality: str
+
+
+# ----------- Utilities -----------
+def encode_image_to_base64(image_path: str) -> str:
+    """Encode image to base64 string."""
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode()
+
+
+def create_messages(prompt: str, image_path: str) -> List[Dict]:
+    """Create the messages array for an API client call."""
+    content = [
+        {"type": "text", "text": prompt},
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/png;base64,{encode_image_to_base64(image_path)}"
+            },
+        },
+    ]
+    return [{"role": "user", "content": content}]
+
+
+def clean_json_string(json_str: str) -> str:
+    """
+    Clean common JSON formatting issues from LLM responses.
+
+    Args:
+        json_str: Raw JSON string that may contain formatting issues
+
+    Returns:
+        Cleaned JSON string
+    """
+    # Remove markdown code block markers
+    json_str = re.sub(r"```(?:json)?\s*", "", json_str)
+    json_str = re.sub(r"\s*```", "", json_str)
+
+    # Fix malformed string patterns like: "field": ",\n" ,
+    # This handles the specific error case where strings are malformed with newlines
+    json_str = re.sub(r':\s*",\s*"\s*,', ': "",', json_str)
+
+    # Fix incomplete string literals with control characters
+    # Pattern: "field": "partial_value\nrest_of_value",
+    json_str = re.sub(r':\s*"([^"]*)\n([^"]*)",', r': "\1\2",', json_str)
+
+    # Fix the specific pattern from the error: "field": "value\n" followed by whitespace and comma
+    json_str = re.sub(r':\s*"([^"]*)\n"\s*,', r': "\1",', json_str)
+
+    # Remove trailing commas in objects and arrays
+    json_str = re.sub(r",(\s*[}\]])", r"\1", json_str)
+
+    # Fix missing quotes around keys (sometimes LLMs output unquoted keys)
+    json_str = re.sub(r"([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:", r'\1"\2":', json_str)
+
+    # Fix single quotes to double quotes (JSON requires double quotes)
+    json_str = re.sub(r"'([^']*)'", r'"\1"', json_str)
+
+    # Drop control characters (including stray newlines) that are not allowed
+    # inside JSON strings; tabs, carriage returns, and all printable text stay
+    json_str = "".join(char for char in json_str if ord(char) >= 32 or char in "\t\r")
+
+    # Fix null-like values that should be proper JSON null
+    json_str = re.sub(r":\s*None\s*,", ": null,", json_str, flags=re.IGNORECASE)
+    json_str = re.sub(r":\s*undefined\s*,", ": null,", json_str, flags=re.IGNORECASE)
+
+    return json_str
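+# For example, the malformed fragment {'box_1_wages': 100.0, "box_12a_code": "D",}
+# is cleaned to {"box_1_wages": 100.0, "box_12a_code": "D"}, which json.loads accepts.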
+
+
+def extract_json_from_response(response: str) -> Tuple[Dict[str, Any], bool]:
+    """
+    Robust JSON extraction from LLM responses with comprehensive error handling.
+
+    Args:
+        response: Raw response text from LLM
+
+    Returns:
+        Tuple of (extracted_json_dict, has_error)
+    """
+    if not response or not response.strip():
+        logger.warning("Empty response provided")
+        return {}, True
+
+    # Strategy 1: Look for JSON content between triple backticks
+    json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", response, re.DOTALL)
+    if json_match:
+        json_str = json_match.group(1)
+    else:
+        # Strategy 2: Look for a JSON object pattern (handles one level of nested braces)
+        json_match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", response, re.DOTALL)
+        if json_match:
+            json_str = json_match.group(0)
+        else:
+            # Strategy 3: Find content between first { and last }
+            start_idx = response.find("{")
+            end_idx = response.rfind("}")
+            if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
+                json_str = response[start_idx : end_idx + 1]
+            else:
+                logger.warning("No JSON pattern found in response")
+                logger.debug(f"Response snippet: {response[:200]}...")
+                return {}, True
+
+    # Clean the extracted JSON string
+    original_json_str = json_str
+    json_str = clean_json_string(json_str)
+
+    # Attempt to parse with multiple strategies
+    parsing_strategies = [
+        ("direct", lambda s: json.loads(s)),
+        ("strip_whitespace", lambda s: json.loads(s.strip())),
+        (
+            "fix_escapes",
+            lambda s: json.loads(s.replace("\\\\", "\\").replace('\\"', '"')),
+        ),
+    ]
+
+    for strategy_name, parse_func in parsing_strategies:
+        try:
+            parsed_json = parse_func(json_str)
+
+            # Validate that it's a dictionary (expected for most use cases)
+            if not isinstance(parsed_json, dict):
+                logger.warning(
+                    f"Extracted JSON is not a dictionary: {type(parsed_json)}"
+                )
+                continue
+
+            logger.debug(f"Successfully parsed JSON using strategy: {strategy_name}")
+            return parsed_json, False
+
+        except json.JSONDecodeError as e:
+            logger.debug(f"Strategy '{strategy_name}' failed: {e}")
+            continue
+        except Exception as e:
+            logger.debug(f"Unexpected error in strategy '{strategy_name}': {e}")
+            continue
+
+    # If all strategies fail, log details for debugging
+    logger.error("All JSON parsing strategies failed")
+    logger.debug(f"Original JSON string (first 500 chars): {original_json_str[:500]}")
+    logger.debug(f"Cleaned JSON string (first 500 chars): {json_str[:500]}")
+
+    return {}, True
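+# Example: extract_json_from_response('Sure! ```json\n{"box_1_wages": 55000.0}\n```')
+# returns ({"box_1_wages": 55000.0}, False); a response with no braces returns ({}, True).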
+
+
+def generate_prompt(structured=True) -> str:
+    """Generate the extraction prompt for the model."""
+    prompt = (
+        "You are an expert document information extraction system. "
+        "I will show you an image of a W-2 tax form. "
+        "Please extract all the information from this form and return it in a JSON format. "
+        "Include all fields such as employee details, employer details, wages, federal income tax withheld, "
+        "social security wages, social security tax withheld, medicare wages and tips, medicare tax withheld, "
+        "and any other information present on the form. "
+    )
+
+    if not structured:
+        # Without server-side structured output, embed the schema in the prompt instead
+        json_schema = W2Form.model_json_schema()
+        prompt += f"Return ONLY the JSON output, without any additional text or explanations, following this schema: {json_schema}"
+
+    return prompt
+
+
+def call_api_client(
+    client: OpenAI,
+    messages: List[Dict],
+    model: str = "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    temperature: float = 0.0,
+    max_tokens: int = 8192,
+    response_format: Optional[Dict] = None,
+    timeout: int = 300,
+    seed: Optional[int] = 42,
+):
+    """
+    Call a compatible API server using the OpenAI-compatible client.
+    """
+    try:
+        kwargs = {
+            "model": model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "timeout": timeout,
+        }
+
+        # Add seed if provided for reproducible generation
+        if seed is not None:
+            kwargs["seed"] = seed
+
+        # Add response format if structured output is enabled
+        if response_format:
+            kwargs["response_format"] = response_format
+
+        logger.debug(f"Making API client call with model: {model}")
+        response = client.chat.completions.create(**kwargs)
+
+        logger.debug(f"Received response with {len(response.choices)} choices")
+        return response
+
+    except Exception as e:
+        logger.error(f"API client call failed: {e}")
+        raise
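+# Minimal usage sketch (the URL, key, and image path below are placeholders):
+#   client = OpenAI(api_key="EMPTY", base_url="http://localhost:8001/v1")
+#   messages = create_messages(generate_prompt(structured=False), "/tmp/w2_sample.png")
+#   print(call_api_client(client, messages).choices[0].message.content)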
+
+
+def process_single_sample(
+    client: OpenAI,
+    sample_data: Tuple[int, Dict],
+    output_dir: str,
+    model: str,
+    structured: bool,
+    timeout: int,
+) -> Dict[str, Any]:
+    """Process a single sample using the OpenAI SDK."""
+    idx, sample = sample_data
+
+    try:
+        # Get image
+        image = sample["image"]
+
+        # Save image temporarily
+        image_path = get_image_path(image, output_dir, idx)
+        logger.debug(f"Saved image to {image_path}")
+
+        # Generate prompt and messages
+        prompt = generate_prompt(structured)
+        messages = create_messages(prompt, image_path)
+
+        # Prepare response format for structured output
+        response_format = None
+        if structured:
+            json_schema = W2Form.model_json_schema()
+            response_format = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "W2Form",
+                    "schema": json_schema,
+                    "strict": True,
+                },
+            }
+
+        # Call API client
+        start_time = time.time()
+
+        try:
+            response = call_api_client(
+                client=client,
+                messages=messages,
+                model=model,
+                response_format=response_format,
+                timeout=timeout,
+            )
+
+            content = response.choices[0].message.content
+            usage = response.usage.model_dump() if response.usage else {}
+
+        except Exception as e:
+            logger.error(f"Error calling OpenAI SDK for sample {idx}: {e}")
+            content = ""
+            usage = {}
+
+        processing_time = time.time() - start_time
+
+        # Extract JSON from response
+        extracted_json, json_parsing_error = extract_json_from_response(content)
+
+        # Get ground truth
+        ground_truth_raw = json.loads(sample["ground_truth"])
+
+        # Handle the gt_parse wrapper structure if present
+        if "gt_parse" in ground_truth_raw:
+            ground_truth = ground_truth_raw["gt_parse"]
+        else:
+            ground_truth = ground_truth_raw
+
+        # Normalize for comparison
+        normalized_pred = normalize_json(extracted_json)
+        normalized_gt = normalize_json(ground_truth)
+
+        # Save results
+        result = {
+            "sample_id": idx,
+            "prediction": extracted_json,
+            "ground_truth": ground_truth,
+            "normalized_prediction": normalized_pred,
+            "normalized_gt": normalized_gt,
+            "raw_response": content,
+            "processing_time": processing_time,
+            "json_parsing_error": json_parsing_error,
+            "usage": usage,
+        }
+
+        return result
+
+    except Exception as e:
+        traceback_str = traceback.format_exc()
+        logger.error(f"Error processing sample {idx}: {e}\n{traceback_str}")
+        return {
+            "sample_id": idx,
+            "prediction": {},
+            "ground_truth": {},
+            "normalized_prediction": {},
+            "normalized_gt": {},
+            "raw_response": "",
+            "processing_time": 0.0,
+            "json_parsing_error": True,
+            "usage": {},
+            "error": str(e),
+        }
+
+
+def calculate_metrics(results: List[Dict]) -> Dict[str, Any]:
+    """Calculate accuracy metrics for the predictions."""
+    if not results:
+        logger.error("No results provided")
+        return {"accuracy": 0.0, "field_accuracy": {}, "parse_error": 0.0}
+
+    # Initialize metrics
+    total_fields = 0
+    correct_fields = 0
+    parse_errors = 0
+    total_records = len(results)
+    logger.info(f"Total records: {total_records}")
+    field_counts = {}
+    field_correct = {}
+
+    for result in results:
+        # Compare normalized values so that case, spacing, and number formatting
+        # differences (e.g. "1,000" vs "1000.0") do not count as errors
+        pred, gt = result["normalized_prediction"], result["normalized_gt"]
+
+        if result["json_parsing_error"]:
+            parse_errors += 1
+            total_fields += len(gt)
+            continue
+
+        for field in gt.keys():
+            # Count total occurrences of this field
+            field_counts[field] = field_counts.get(field, 0) + 1
+            total_fields += 1
+
+            # Check if field is correct
+            if field in pred and pred[field] == gt[field]:
+                correct_fields += 1
+                field_correct[field] = field_correct.get(field, 0) + 1
+
+    # Calculate overall accuracy
+    accuracy = correct_fields / total_fields if total_fields > 0 else 0.0
+    errors = parse_errors / total_records if total_records > 0 else 0.0
+
+    # Calculate per-field accuracy
+    field_accuracy = {}
+    for field in field_counts:
+        field_accuracy[field] = field_correct.get(field, 0) / field_counts[field]
+
+    return {
+        "accuracy": accuracy,
+        "field_accuracy": field_accuracy,
+        "parse_error": errors,
+    }
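+# Worked example: 2 parsed records with 40 ground-truth fields each give
+# total_fields = 80; 60 exact matches yield accuracy = 60 / 80 = 0.75.
+# One unparseable record out of 2 gives parse_error = 1 / 2 = 0.5.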
+
+
+def normalize_field_value(value: Any) -> str:
+    """Normalize field values for comparison."""
+    if value is None:
+        return ""
+
+    # Convert to string and normalize
+    value_str = str(value).strip().lower()
+
+    # Remove common separators in numbers
+    value_str = value_str.replace(",", "").replace(" ", "")
+
+    # Try to convert to float for numeric comparison
+    try:
+        value_float = float(value_str)
+        return str(value_float)
+    except ValueError:
+        return value_str
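+# e.g. normalize_field_value("1,234.50") -> "1234.5"; normalize_field_value(None) -> ""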
+
+
+def normalize_json(json_obj: Dict) -> Dict:
+    """Normalize JSON object for comparison."""
+    normalized = {}
+
+    for key, value in json_obj.items():
+        # Normalize key (lowercase, spaces to underscores)
+        norm_key = key.lower().replace(" ", "_")
+
+        # Normalize value
+        if isinstance(value, dict):
+            normalized[norm_key] = normalize_json(value)
+        elif isinstance(value, list):
+            normalized[norm_key] = [normalize_field_value(v) for v in value]
+        else:
+            normalized[norm_key] = normalize_field_value(value)
+
+    return normalized
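+# e.g. normalize_json({"Box 1 Wages": "1,000"}) -> {"box_1_wages": "1000.0"}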
+
+
+def get_image_path(image: Image.Image, output_dir: str, idx: int) -> str:
+    """Save the image to a temporary file and return its path."""
+    # Create a temporary file for the image
+    temp_dir = Path(output_dir) / "temp"
+    os.makedirs(temp_dir, exist_ok=True)
+    image_path = temp_dir / f"temp_{idx}.png"
+    image_path = str(image_path.resolve())
+    image.save(image_path)
+    return image_path
+
+
+def vllm_openai_sdk_evaluation(
+    test_set,
+    output_dir: str,
+    server_url: str = "http://localhost:8001",
+    api_key: str = "default-blank-localhost",
+    model: str = "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    structured: bool = True,
+    timeout: int = 300,
+    max_workers: int = 10,
+):
+    """
+    Evaluate the W2 extraction task using the OpenAI SDK with batch processing.
+    """
+    # Initialize OpenAI client
+    client = OpenAI(
+        api_key=api_key,  # vLLM doesn't require a real API key
+        base_url=f"{server_url}/v1",
+    )
+
+    # Prepare sample data for batch processing
+    sample_data = [(idx, sample) for idx, sample in enumerate(test_set)]
+
+    results = []
+
+    # Use ThreadPoolExecutor for concurrent processing
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_sample = {
+            executor.submit(
+                process_single_sample,
+                client,
+                data,
+                output_dir,
+                model,
+                structured,
+                timeout,
+            ): data[0]
+            for data in sample_data
+        }
+
+        # Collect results with progress bar
+        for future in tqdm(
+            as_completed(future_to_sample),
+            total=len(sample_data),
+            desc="Processing samples with OpenAI SDK (batch)",
+        ):
+            sample_idx = future_to_sample[future]
+            try:
+                result = future.result()
+                results.append(result)
+            except Exception as e:
+                logger.error(f"Exception in sample {sample_idx}: {e}")
+                # Add error result
+                results.append(
+                    {
+                        "sample_id": sample_idx,
+                        "prediction": {},
+                        "ground_truth": {},
+                        "normalized_prediction": {},
+                        "normalized_gt": {},
+                        "raw_response": "",
+                        "processing_time": 0.0,
+                        "json_parsing_error": True,
+                        "usage": {},
+                        "error": str(e),
+                    }
+                )
+
+    # Sort results by sample_id to maintain order
+    results.sort(key=lambda x: x["sample_id"])
+
+    return results
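+# Design note: a single OpenAI client is shared across worker threads; the
+# OpenAI Python SDK is generally safe for concurrent use, but if you hit
+# connection limits, reduce --max_workers or create one client per thread.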
+
+
+def vllm_openai_sdk_sequential_evaluation(
+    test_set,
+    output_dir: str,
+    server_url: str = "http://localhost:8001",
+    api_key: str = "default-blank-localhost",
+    model: str = "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    structured: bool = True,
+    timeout: int = 300,
+):
+    """
+    Evaluate the W2 extraction task using the OpenAI SDK sequentially (for debugging).
+    """
+    # Initialize OpenAI client
+    client = OpenAI(
+        api_key=api_key,  # vLLM doesn't require a real API key
+        base_url=f"{server_url}/v1",
+    )
+
+    results = []
+
+    for idx, sample in enumerate(
+        tqdm(test_set, desc="Processing samples with OpenAI SDK (sequential)")
+    ):
+        result = process_single_sample(
+            client, (idx, sample), output_dir, model, structured, timeout
+        )
+        results.append(result)
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Evaluate vision-language model on W2 tax form dataset using OpenAI SDK"
+    )
+    parser.add_argument(
+        "--server_url",
+        type=str,
+        default="http://localhost:8001",
+        help="URL of the vLLM HTTP server",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        help="Model name to use for inference",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default="singhsays/fake-w2-us-tax-form-dataset",
+        help="Name of the Huggingface dataset, or a path to a dataset saved with save_to_disk()",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./w2_evaluation_results",
+        help="Directory to save evaluation results",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=10,
+        help="Number of samples to evaluate (default: 10, use -1 for all)",
+    )
+    parser.add_argument(
+        "--structured",
+        action="store_true",
+        help="Whether to use structured output (JSON schema)",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=300,
+        help="Timeout for SDK requests in seconds",
+    )
+    parser.add_argument(
+        "--max_workers",
+        type=int,
+        default=10,
+        help="Maximum number of concurrent workers for batch processing",
+    )
+    parser.add_argument(
+        "--sequential",
+        action="store_true",
+        help="Process samples sequentially instead of in parallel (for debugging)",
+    )
+
+    args = parser.parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load dataset
+    logger.info(f"Loading dataset: {args.dataset_name}")
+    test_set = None
+    # A dataset saved locally with save_to_disk() contains a state.json;
+    # otherwise treat the name as a Hugging Face Hub dataset
+    if Path(args.dataset_name, "state.json").exists():
+        test_set = load_from_disk(args.dataset_name)
+    else:
+        dataset = load_dataset(args.dataset_name)
+        if "test" not in dataset:
+            logger.error("Dataset does not have a test split")
+            return 1
+        test_set = dataset["test"]
+
+    logger.info(f"Loaded test set with {len(test_set)} samples")
+
+    # Limit number of samples if specified
+    if args.limit > 0 and args.limit < len(test_set):
+        test_set = test_set.select(range(args.limit))
+        logger.info(f"Limited to {args.limit} samples")
+
+    # Get API key from environment variable
+    api_key = os.getenv("LLAMA_API_KEY") or os.getenv("OPENAI_API_KEY")
+
+    if not api_key:
+        logger.warning(
+            "No API key found. Please set the LLAMA_API_KEY or OPENAI_API_KEY environment variable for public APIs."
+        )
+        api_key = "default-blank-localhost"
+
+    # Test server connection
+    try:
+        client = OpenAI(
+            api_key=api_key,
+            base_url=f"{args.server_url}/v1",
+        )
+        # Test with a simple call
+        models = client.models.list()
+        logger.info(f"Successfully connected to vLLM server at {args.server_url}")
+        logger.info(f"Available models: {[model.id for model in models.data]}")
+    except Exception as e:
+        logger.error(f"Failed to connect to vLLM server at {args.server_url}: {e}")
+        logger.error("Make sure the vLLM server is running and accessible")
+        return 1
+
+    # Run evaluation
+    if args.sequential:
+        logger.info("Running sequential evaluation...")
+        results = vllm_openai_sdk_sequential_evaluation(
+            test_set=test_set,
+            output_dir=args.output_dir,
+            server_url=args.server_url,
+            api_key=api_key,
+            model=args.model,
+            structured=args.structured,
+            timeout=args.timeout,
+        )
+    else:
+        logger.info(f"Running batch evaluation with {args.max_workers} workers...")
+        results = vllm_openai_sdk_evaluation(
+            test_set=test_set,
+            output_dir=args.output_dir,
+            server_url=args.server_url,
+            api_key=api_key,
+            model=args.model,
+            structured=args.structured,
+            timeout=args.timeout,
+            max_workers=args.max_workers,
+        )
+
+    # Save detailed results
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    try:
+        results_file = os.path.join(args.output_dir, f"results_{timestamp}.json")
+        with open(results_file, "w") as f:
+            json.dump(results, f, indent=2)
+        logger.info(f"Detailed results saved to {results_file}")
+
+    except Exception as e:
+        logger.error(f"Error saving detailed results: {str(e)}")
+        return 1
+
+    # Calculate metrics
+    metrics = calculate_metrics(results)
+
+    # Save evaluation summary
+    output_file = os.path.join(args.output_dir, f"evaluation_results_{timestamp}.json")
+    arguments = {
+        "server_url": args.server_url,
+        "model": args.model,
+        "output_dir": args.output_dir,
+        "dataset_name": args.dataset_name,
+        "limit": args.limit,
+        "structured": args.structured,
+        "timeout": args.timeout,
+        "max_workers": args.max_workers,
+        "sequential": args.sequential,
+        "prompt": generate_prompt(args.structured),
+    }
+
+    summary = {
+        "arguments": arguments,
+        "metrics": metrics,
+        "timestamp": timestamp,
+        "total_samples": len(results),
+    }
+
+    with open(output_file, "w") as f:
+        json.dump(summary, f, indent=2)
+
+    # Print summary
+    logger.info("=" * 50)
+    logger.info("EVALUATION SUMMARY")
+    logger.info("=" * 50)
+    logger.info(f"Overall accuracy: {metrics['accuracy']:.4f}")
+    logger.info(f"Parse error rate: {metrics['parse_error']:.4f}")
+    logger.info("Field-level accuracy:")
+    field_accuracy = metrics["field_accuracy"]
+    for field, acc in sorted(field_accuracy.items(), key=lambda x: x[1], reverse=True):
+        logger.info(f"  {field}: {acc:.4f}")
+
+    logger.info(f"Results saved to {output_file}")
+
+    # Clean up the temp image directory if it exists
+    temp_dir = os.path.join(args.output_dir, "temp")
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())