
updated data_loader to handle splits and added a custom dataset

Ubuntu, 2 months ago
parent
commit
936c249933

+ 81 - 9
src/finetune_pipeline/config.yaml

@@ -1,23 +1,92 @@
+# # Configuration for data loading, formatting, and fine-tuning
+
+
+# output_dir: "/tmp/finetune_pipeline/outputs/"  # Directory to store output files
+
+# data:
+#   data_path: "dz-osamu/IU-Xray"  # Path to the dataset to format (either a Hugging Face dataset ID or a local path)
+#   is_local: false                  # Whether the data is stored locally
+#   # Maps custom column names to standard field names
+#   column_mapping:
+#     input: "query"             # Field containing the input text
+#     output: "response"              # Field containing the output text
+#     image: "image"           # Field containing the image path (optional)
+#   # Additional arguments to pass to the load_dataset function
+#   # dataset_kwargs:
+#   #   split: "train"                # Dataset split to load
+#   #   # Add any other dataset-specific arguments here
+
+
+# # Formatter configuration
+# formatter:
+#   type: "vllm"  # Type of formatter to use ('torchtune', 'vllm', or 'openai')
+
+
+# # # Something like this in the torchtune config
+# # dataset:
+# #   _component_: torchtune.datasets.CustomSFTDataset
+# #   packed: False
+# #   split: train
+# # seed: null
+# # shuffle: True
+
+
+# # Training configuration
+# finetuning:
+#   strategy: "lora"               # Training strategy ('fft' or 'lora')
+#   num_epochs: 1                 # Number of training epochs
+#   batch_size: 1                 # Batch size per device for training
+#   torchtune_config: "llama3_2_vision/11B_lora"             # TorchTune-specific configuration
+#   num_processes_per_node: 8             # TorchTune-specific configuration
+#   distributed: true             # Whether to use distributed training
+
+
+# # vLLM Inference configuration
+# inference:
+#   # Model configuration
+#   model_path: "/home/ubuntu/yash-workspace/medgemma-4b-it" # Path to the model checkpoint
+#   quantization: null            # Quantization method (awq, gptq, squeezellm)
+
+#   # Server configuration
+#   port: 8000                    # Port to run the server on
+#   host: "0.0.0.0"               # Host to run the server on
+
+#   # Performance configuration
+#   tensor_parallel_size: 1       # Number of GPUs to use for tensor parallelism
+#   max_model_len: 32           # Maximum sequence length
+#   max_num_seqs: 1              # Maximum number of sequences
+#   gpu_memory_utilization: 0.9   # Fraction of GPU memory to use
+#   enforce_eager: false          # Enforce eager execution
+
+#   eval_data: "your/eval/dataset/path" # Path to the evaluation dataset (optional)
+
+#   # Additional vLLM parameters (optional)
+#   # swap_space: 4               # Size of CPU swap space in GiB
+#   # block_size: 16              # Size of blocks used in the KV cache
+#   # disable_log_stats: true     # Disable logging of stats
+#   # disable_log_requests: false # Disable logging of requests
+
+
+
 # Configuration for data loading, formatting, and fine-tuning
 
 
-output_dir: "/tmp/finetune_pipeline/outputs/"  # Directory to store output files
+output_dir: "/home/ubuntu/yash-workspace/outputs"  # Directory to store output files
 
 # Formatter configuration
 formatter:
-  type: "vllm"  # Type of formatter to use ('torchtune', 'vllm', or 'openai')
+  type: "torchtune"  # Type of formatter to use ('torchtune', 'vllm', or 'openai')
   data_path: "dz-osamu/IU-Xray"  # Path to the dataset to format (either a Hugging Face dataset ID or a local path)
   is_local: false                  # Whether the data is stored locally
   # Maps custom column names to standard field names
   column_mapping:
     input: "query"             # Field containing the input text
     output: "response"              # Field containing the output text
-    image: null           # Field containing the image path (optional)
+    image: "images"           # Field containing the image path (optional)
 
   # Additional arguments to pass to the load_dataset function
   dataset_kwargs:
     split: "train"                # Dataset split to load
-    # Add any other dataset-specific arguments here
 
 # Training configuration
 finetuning:
@@ -25,15 +94,17 @@ finetuning:
   num_epochs: 1                 # Number of training epochs
   batch_size: 1                 # Batch size per device for training
   torchtune_config: "llama3_2_vision/11B_lora"             # TorchTune-specific configuration
-  num_processes_per_node: 8             # TorchTune-specific configuration
-  distributed: true             # Whether to use distributed training
+  num_processes_per_node: 1             # Number of processes (GPUs) to launch per node
+  distributed: false             # Whether to use distributed training
 
 
 # vLLM Inference configuration
 inference:
   # Model configuration
-  model_path: "/home/ubuntu/yash-workspace/medgemma-4b-it" # Path to the model checkpoint
+  model_path: "your/model/path" # Path to the model checkpoint
   quantization: null            # Quantization method (awq, gptq, squeezellm)
+  dtype: "auto"                 # Data type for model weights (half, float, bfloat16, auto)
+  trust_remote_code: false      # Trust remote code when loading the model
 
   # Server configuration
   port: 8000                    # Port to run the server on
@@ -41,8 +112,8 @@ inference:
 
   # Performance configuration
   tensor_parallel_size: 1       # Number of GPUs to use for tensor parallelism
-  max_model_len: 32           # Maximum sequence length
-  max_num_seqs: 1              # Maximum number of sequences
+  max_model_len: 1024           # Maximum sequence length
+  max_num_seqs: 16              # Maximum number of sequences
   gpu_memory_utilization: 0.9   # Fraction of GPU memory to use
   enforce_eager: false          # Enforce eager execution
 
@@ -53,3 +124,4 @@ inference:
   # block_size: 16              # Size of blocks used in the KV cache
   # disable_log_stats: true     # Disable logging of stats
   # disable_log_requests: false # Disable logging of requests
+
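For context, a minimal sketch of how this config is consumed, mirroring the updated data_loader entry point further down in this commit. It assumes PyYAML and a plain safe_load; the repo's own read_config helper may differ.

    import yaml

    # Load the pipeline config (a sketch; read_config in the repo may add validation)
    with open("src/finetune_pipeline/config.yaml") as f:
        config = yaml.safe_load(f)

    formatter_config = config.get("formatter", {})  # type, data_path, column_mapping, dataset_kwargs
    output_dir = config.get("output_dir", "/tmp/finetune-pipeline/data/")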

+ 101 - 31
src/finetune_pipeline/data/data_loader.py

@@ -79,7 +79,7 @@ def load_data(data_path: str, is_local: bool = False, **kwargs):
         **kwargs: Additional arguments to pass to the load_dataset function
 
     Returns:
-        Dataset object from the datasets library
+        Dataset object from the datasets library with all splits
 
     Raises:
         ImportError: If the datasets package is not installed
@@ -174,9 +174,18 @@ def convert_to_conversations(data, column_mapping: Optional[Dict] = None):
         user_content = [
             {"type": "text", "text": input_text},
         ]
-        # Add image to user content
+        # Add image(s) to user content
         if image is not None:
-            user_content.append({"type": "image", "image_url": {"url": image}})
+            if isinstance(image, list):
+                # Handle list of images
+                for img in image:
+                    if img:  # Check if image path is not empty
+                        user_content.append(
+                            {"type": "image", "image_url": {"url": img}}
+                        )
+            else:
+                # Handle single image
+                user_content.append({"type": "image", "image_url": {"url": image}})
 
         user_message = {"role": "user", "content": user_content}
 
@@ -197,7 +206,7 @@ def convert_to_conversations(data, column_mapping: Optional[Dict] = None):
 
 
 def save_formatted_data(
-    formatted_data: List[Any], output_dir: str, formatter_type: str
+    formatted_data: List[Any], output_dir: str, formatter_type: str, split: str
 ) -> str:
     """
     Save formatted data to a JSON file.
@@ -215,7 +224,7 @@ def save_formatted_data(
 
     # Define the output file path
     formatted_data_path = os.path.join(
-        output_dir, f"{formatter_type}_formatted_data.json"
+        output_dir, f"{split}_{formatter_type}_formatted_data.json"
     )
 
     # Save the formatted data
@@ -237,7 +246,7 @@ def save_formatted_data(
     return formatted_data_path
 
 
-def save_conversation_data(conversation_data: List, output_dir: str) -> str:
+def save_conversation_data(conversation_data: List, output_dir: str, split: str) -> str:
     """
     Save conversation data to a JSON file.
 
@@ -252,7 +261,7 @@ def save_conversation_data(conversation_data: List, output_dir: str) -> str:
     os.makedirs(output_dir, exist_ok=True)
 
     # Define the output file path
-    conversation_data_path = os.path.join(output_dir, "conversation_data.json")
+    conversation_data_path = os.path.join(output_dir, f"{split}_conversation_data.json")
 
     # Convert Conversation objects to a serializable format
     serializable_conversations = []
@@ -267,37 +276,103 @@ def save_conversation_data(conversation_data: List, output_dir: str) -> str:
     return conversation_data_path
 
 
-def format_data(data, formatter_type: str, column_mapping: Optional[Dict] = None):
+def format_data(
+    data,
+    formatter_type: str,
+    output_dir: str,
+    column_mapping: Optional[Dict] = None,
+    dataset_kwargs: Optional[Dict] = None,
+):
     """
-    Format the data using the specified formatter.
+    Format the data using the specified formatter for all splits.
 
     Args:
-        data: Data to format
+        data: Dataset with multiple splits to format or a single dataset
         formatter_type: Type of formatter to use ('torchtune', 'vllm', or 'openai')
+        output_dir: Directory to save the formatted data
         column_mapping: Optional mapping of column names
+        dataset_kwargs: Optional dataset kwargs that may contain split information
 
     Returns:
-        Tuple containing formatted data and conversation data
+        Tuple containing (formatted_data_paths, conversation_data_paths) where each is a list of paths to saved files
     """
-    # First convert the data to conversations
-    conversations = convert_to_conversations(data, column_mapping)
+    formatted_data_paths = []
+    conversation_data_paths = []
+
+    # Check if the dataset has explicit splits
+    if (
+        hasattr(data, "keys")
+        and callable(data.keys)
+        and len(data.keys()) > 0
+        and isinstance(data, dict)
+    ):
+        # Dataset has splits (train, validation, test, etc.)
+        splits = data.keys()
+
+        for split in splits:
+            # First convert the data to conversations
+            conversations = convert_to_conversations(data[split], column_mapping)
+
+            # Then get the formatter and format the conversations
+            formatter = get_formatter(formatter_type)
+            formatted_data = formatter.format_data(conversations)
+            print(
+                f"Loaded and formatted data for split '{split}': {len(formatted_data)} samples"
+            )
+
+            # Save the formatted data
+            formatted_data_path = save_formatted_data(
+                formatted_data, output_dir, formatter_type, split
+            )
+            formatted_data_paths.append(formatted_data_path)
+
+            # Save the conversation data
+            conversation_data_path = save_conversation_data(
+                conversations, output_dir, split
+            )
+            conversation_data_paths.append(conversation_data_path)
+    else:
+        # Dataset doesn't have explicit splits, treat it as a single dataset
+        # Check if a split is specified in dataset_kwargs
+        split = "default"
+        if dataset_kwargs and "split" in dataset_kwargs:
+            split = dataset_kwargs["split"]
+
+        # First convert the data to conversations
+        conversations = convert_to_conversations(data, column_mapping)
+
+        # Then get the formatter and format the conversations
+        formatter = get_formatter(formatter_type)
+        formatted_data = formatter.format_data(conversations)
+        print(
+            f"Loaded and formatted data for split '{split}': {len(formatted_data)} samples"
+        )
 
-    # Then get the formatter and format the conversations
-    formatter = get_formatter(formatter_type)
-    formatted_data = formatter.format_data(conversations)
+        # Save the formatted data
+        formatted_data_path = save_formatted_data(
+            formatted_data, output_dir, formatter_type, split
+        )
+        formatted_data_paths.append(formatted_data_path)
+
+        # Save the conversation data
+        conversation_data_path = save_conversation_data(
+            conversations, output_dir, split
+        )
+        conversation_data_paths.append(conversation_data_path)
 
-    return formatted_data, conversations
+    return formatted_data_paths, conversation_data_paths
 
 
-def load_and_format_data(formatter_config: Dict):
+def load_and_format_data(formatter_config: Dict, output_dir: str):
     """
     Load and format data based on the configuration.
 
     Args:
         formatter_config: Dictionary containing formatter configuration parameters
+        output_dir: Directory to save the formatted data
 
     Returns:
-        Formatted data in the specified format
+        Tuple containing (formatted_data_paths, conversation_data_paths) where each is a list of paths to saved files
     """
 
     # Extract parameters from config
@@ -316,11 +391,11 @@ def load_and_format_data(formatter_config: Dict):
     data = load_data(data_path, is_local, **dataset_kwargs)
 
     # Format the data
-    formatted_data, conversation_data = format_data(
-        data, formatter_type, column_mapping
+    formatted_data_paths, conversation_data_paths = format_data(
+        data, formatter_type, output_dir, column_mapping, dataset_kwargs
     )
 
-    return formatted_data, conversation_data
+    return formatted_data_paths, conversation_data_paths
 
 
 if __name__ == "__main__":
@@ -341,14 +416,9 @@ if __name__ == "__main__":
     # Read the configuration
     config = read_config(args.config)
     formatter_config = config.get("formatter", {})
-    output_dir = config.get("output_dir")
+    output_dir = config.get("output_dir", "/tmp/finetune-pipeline/data/")
 
     # Load and format the data
-    formatted_data, conversation_data = load_and_format_data(formatter_config)
-    print(f"Loaded and formatted data: {len(formatted_data)} samples")
-
-    # Save the data if output_dir is provided
-    if output_dir:
-        formatter_type = formatter_config.get("type", "torchtune")
-        save_formatted_data(formatted_data, output_dir, formatter_type)
-        save_conversation_data(conversation_data, output_dir)
+    formatted_data_paths, conversation_data_paths = load_and_format_data(
+        formatter_config, output_dir
+    )
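With the split handling above, one formatted file and one conversation file are written per split. A hedged usage sketch, assuming the package is importable as finetune_pipeline and that the requested split exists in dz-osamu/IU-Xray:

    from finetune_pipeline.data.data_loader import load_and_format_data

    formatter_config = {
        "type": "torchtune",
        "data_path": "dz-osamu/IU-Xray",
        "is_local": False,
        "column_mapping": {"input": "query", "output": "response", "image": "images"},
        "dataset_kwargs": {"split": "train"},  # a single split, so the else branch above is taken
    }

    formatted_paths, conversation_paths = load_and_format_data(
        formatter_config, "/home/ubuntu/yash-workspace/outputs"
    )
    # Expected outputs (assumed): .../train_torchtune_formatted_data.json
    #                             .../train_conversation_data.json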

+ 24 - 0
src/finetune_pipeline/finetuning/dataset.py

@@ -0,0 +1,24 @@
+from torchtune.datasets import SFTDataset
+from torchtune.modules.transforms import Transform
+from torchtune.data import OpenAIToMessages
+
+
+def custom_sft_dataset(
+    model_transform: Transform,
+    *,
+    split: str = "train",
+    dataset_path: str = "files/synthetic_data/train.csv",
+    train_on_input: bool = True,
+) -> SFTDataset:
+    """Creates a custom dataset."""
+
+    openaitomessage = OpenAIToMessages(train_on_input=train_on_input)
+
+    ds = SFTDataset(
+        source="json",
+        data_files=dataset_path,
+        split="train",
+        message_transform=openaitomessage,
+        model_transform=Transform,
+    )
+    return ds
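To wire this builder into a torchtune recipe, the dataset section of the torchtune config can point at it, mirroring the commented-out sketch at the top of config.yaml. The _component_ path and the JSON path below are assumptions for illustration, not values set by this commit:

    dataset:
      _component_: finetune_pipeline.finetuning.dataset.custom_sft_dataset  # assumed import path
      dataset_path: /home/ubuntu/yash-workspace/outputs/train_torchtune_formatted_data.json  # assumed data loader output
      train_on_input: True
    seed: null
    shuffle: True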

+ 35 - 1
src/finetune_pipeline/inference/run_inference.py

@@ -70,7 +70,41 @@ class VLLMClient:
             self.logger.error(f"Error sending request to vLLM server: {e}")
             raise
 
-def run_inference_on_eval_data(
+
+def vllm_call_batch(llm, image_paths: List[str], structured: bool):
+    messages_batch = []
+    for img_path in image_paths:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": f"file:///{img_path}"}},
+                    {
+                        "type": "text",
+                        "text": generate_prompt(structured),
+                    },
+                ],
+            }
+        ]
+        messages_batch.append(messages)
+
+    # Using greedy decoding
+    if structured:
+        sampling_params = SamplingParams(
+            temperature=0,
+            top_p=1,
+            max_tokens=8192,
+            guided_decoding=guided_decoding_params,
+        )
+    else:
+        sampling_params = SamplingParams(
+            temperature=0,
+            top_p=1,
+            max_tokens=8192,
+        )
+    return llm.chat(messages_batch, sampling_params, use_tqdm=True)
+
+def run_vllm_batch_inference_on_dataset(
     eval_data_path: str,
     server_url: str = "http://localhost:8000/v1",
     is_local: bool = False,