@@ -0,0 +1,235 @@
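+"""Unit tests for the formatter module.
+
+Covers Conversation creation via convert_to_conversations, plus the output
+shapes produced by TorchtuneFormatter, vLLMFormatter, and OpenAIFormatter.
+"""
+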
+import sys
+import unittest
+from pathlib import Path
+
+# Add the parent directory to the path so we can import the modules
+sys.path.append(str(Path(__file__).parent.parent))
+
+from data.data_loader import convert_to_conversations, format_data, load_data
+from data.formatter import (
+    Conversation,
+    OpenAIFormatter,
+    TorchtuneFormatter,
+    vLLMFormatter,
+)
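+
+# All three formatters are exercised through the same interface below:
+# format_data(conversations), format_conversation(conversation), and
+# format_message(message).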
+
+
+class TestFormatter(unittest.TestCase):
+    """Test cases for the formatter module."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up test fixtures, called before any tests are run."""
+        # Define a small dataset to use for testing
+        cls.dataset_name = "dz-osamu/IU-Xray"
+        cls.split = "train[:10]"  # Use only 10 samples for testing
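+        # NOTE: "train[:10]" relies on the Hugging Face datasets split-slicing
+        # syntax; this assumes load_data forwards the split string unchanged.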
+
+        try:
+            # Load the dataset
+            cls.dataset = load_data(cls.dataset_name, split=cls.split)
+
+            # Map the formatter's canonical fields to the IU-Xray column names
+            cls.column_mapping = {
+                "input": "query",
+                "output": "response",
+                "image": "images",
+            }
+
+            # Convert to list for easier processing
+            cls.data = list(cls.dataset)
+
+            # Convert to conversations
+            cls.conversations = convert_to_conversations(cls.data, cls.column_mapping)
+
+        except Exception as e:
+            print(f"Error setting up test fixtures: {e}")
+            raise
+
+    def test_conversation_creation(self):
+        """Test that conversations are created correctly."""
+        self.assertIsNotNone(self.conversations)
+        self.assertGreater(len(self.conversations), 0)
+
+        # Check that each conversation has at least two messages (user and assistant)
+        for conversation in self.conversations:
+            self.assertGreaterEqual(len(conversation.messages), 2)
+            self.assertEqual(conversation.messages[0]["role"], "user")
+            self.assertEqual(conversation.messages[1]["role"], "assistant")
+
+    def test_torchtune_formatter(self):
+        """Test the TorchtuneFormatter."""
+        formatter = TorchtuneFormatter()
+
+        # format_data should return one formatted item per conversation
+        formatted_data = formatter.format_data(self.conversations)
+        self.assertIsNotNone(formatted_data)
+        self.assertEqual(len(formatted_data), len(self.conversations))
+
+        # format_conversation should produce a dict with a "messages" key
+        formatted_conversation = formatter.format_conversation(self.conversations[0])
+        self.assertIsInstance(formatted_conversation, dict)
+        self.assertIn("messages", formatted_conversation)
+
+        # format_message should produce a dict with "role" and "content" keys
+        message = self.conversations[0].messages[0]
+        formatted_message = formatter.format_message(message)
+        self.assertIsInstance(formatted_message, dict)
+        self.assertIn("role", formatted_message)
+        self.assertIn("content", formatted_message)
+
+    def test_vllm_formatter(self):
+        """Test the vLLMFormatter."""
+        formatter = vLLMFormatter()
+
+        # format_data should return one formatted item per conversation
+        formatted_data = formatter.format_data(self.conversations)
+        self.assertIsNotNone(formatted_data)
+        self.assertEqual(len(formatted_data), len(self.conversations))
+
+        # format_conversation should render the conversation as a single string
+        formatted_conversation = formatter.format_conversation(self.conversations[0])
+        self.assertIsInstance(formatted_conversation, str)
+
+        # format_message should render a single message, embedding its role
+        message = self.conversations[0].messages[0]
+        formatted_message = formatter.format_message(message)
+        self.assertIsInstance(formatted_message, str)
+        self.assertIn(message["role"], formatted_message)
+
+    def test_openai_formatter(self):
+        """Test the OpenAIFormatter."""
+        formatter = OpenAIFormatter()
+
+        # format_data should return one formatted item per conversation
+        formatted_data = formatter.format_data(self.conversations)
+        self.assertIsNotNone(formatted_data)
+        self.assertEqual(len(formatted_data), len(self.conversations))
+
+        # format_conversation should produce a dict with a "messages" key
+        formatted_conversation = formatter.format_conversation(self.conversations[0])
+        self.assertIsInstance(formatted_conversation, dict)
+        self.assertIn("messages", formatted_conversation)
+
+        # format_message should produce a dict with "role" and "content" keys
+        message = self.conversations[0].messages[0]
+        formatted_message = formatter.format_message(message)
+        self.assertIsInstance(formatted_message, dict)
+        self.assertIn("role", formatted_message)
+        self.assertIn("content", formatted_message)
+
+    def test_format_data_function(self):
+        """Test the format_data function from data_loader."""
+        # Test with TorchtuneFormatter
+        torchtune_data = format_data(self.data, "torchtune", self.column_mapping)
+        self.assertIsNotNone(torchtune_data)
+        self.assertEqual(len(torchtune_data), len(self.data))
+
+        # Test with vLLMFormatter
+        vllm_data = format_data(self.data, "vllm", self.column_mapping)
+        self.assertIsNotNone(vllm_data)
+        self.assertEqual(len(vllm_data), len(self.data))
+
+        # Test with OpenAIFormatter
+        openai_data = format_data(self.data, "openai", self.column_mapping)
+        self.assertIsNotNone(openai_data)
+        self.assertEqual(len(openai_data), len(self.data))
+
+    def test_with_mock_data(self):
+        """Test the formatter pipeline with mock data."""
+        # Create mock data that mimics a dataset (the "question" field is not
+        # referenced by the column mapping below, so it is ignored)
+        mock_data = [
+            {
+                "question": "What is the capital of France?",
+                "context": "France is a country in Western Europe. Its capital is Paris.",
+                "answer": "Paris",
+            },
+            {
+                "question": "Who wrote Hamlet?",
+                "context": "Hamlet is a tragedy written by William Shakespeare.",
+                "answer": "William Shakespeare",
+            },
+            {
+                "question": "What is the largest planet in our solar system?",
+                "context": "Jupiter is the largest planet in our solar system.",
+                "answer": "Jupiter",
+            },
+        ]
+
+        # Create a column mapping for the mock data
+        column_mapping = {"input": "context", "output": "answer"}
+
+        # Convert to conversations
+        conversations = convert_to_conversations(mock_data, column_mapping)
+
+        # Test that conversations are created correctly
+        self.assertEqual(len(conversations), len(mock_data))
+        for i, conversation in enumerate(conversations):
+            self.assertEqual(len(conversation.messages), 2)
+            self.assertEqual(conversation.messages[0]["role"], "user")
+            self.assertEqual(conversation.messages[1]["role"], "assistant")
+
+            # Check content of the user message
+            user_content = conversation.messages[0]["content"]
+            self.assertIsInstance(user_content, list)
+            self.assertEqual(user_content[0]["type"], "text")
+            self.assertEqual(user_content[0]["text"], mock_data[i]["context"])
+
+            # Check content of the assistant message
+            assistant_content = conversation.messages[1]["content"]
+            self.assertIsInstance(assistant_content, list)
+            self.assertEqual(assistant_content[0]["type"], "text")
+            self.assertEqual(assistant_content[0]["text"], mock_data[i]["answer"])
+
+        # Test each formatter with the mock data
+        formatters = {
+            "torchtune": TorchtuneFormatter(),
+            "vllm": vLLMFormatter(),
+            "openai": OpenAIFormatter(),
+        }
+
+        for name, formatter in formatters.items():
+            formatted_data = formatter.format_data(conversations)
+            self.assertEqual(len(formatted_data), len(mock_data))
+
+            # Test the first formatted item
+            if name == "vllm":
+                # The vLLM formatter returns strings
+                self.assertIsInstance(formatted_data[0], str)
+                self.assertIn("user:", formatted_data[0])
+                self.assertIn("assistant:", formatted_data[0])
+            else:
+                # The Torchtune and OpenAI formatters return dicts
+                self.assertIsInstance(formatted_data[0], dict)
+                self.assertIn("messages", formatted_data[0])
+                self.assertEqual(len(formatted_data[0]["messages"]), 2)
+
+
+
+if __name__ == "__main__":
+    # If run as a script, this allows passing a dataset name as an argument
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Test the formatter module with a specific dataset"
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="dz-osamu/IU-Xray",
+        help="Name of the Hugging Face dataset to use for testing",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="train[:10]",
+        help="Dataset split to use (e.g., 'train[:10]', 'validation[:10]')",
+    )
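+
+    # Example invocation (the file name here is assumed):
+    #   python test_formatter.py --dataset dz-osamu/IU-Xray --split "validation[:10]"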
+
+    args = parser.parse_args()
+
+    # Override the default dataset in the test class
+    TestFormatter.dataset_name = args.dataset
+    TestFormatter.split = args.split
+
+    # Run the tests, passing a dummy argv so unittest.main does not try to
+    # re-parse the --dataset/--split flags consumed by argparse above
+    unittest.main(argv=["first-arg-is-ignored"])