
added vllm inference

Ubuntu, 3 months ago
Commit 3e0941079e

+ 27 - 1
src/finetune_pipeline/config.yaml

@@ -26,4 +26,30 @@ finetuning:
   batch_size: 1                 # Batch size per device for training
   torchtune_config: "llama3_2_vision/11B_lora"             # TorchTune-specific configuration
   num_processes_per_node: 8             # TorchTune-specific configuration
-  distributed: true             # Whether to use distributed training
+  distributed: true             # Whether to use distributed training
+
+
+# vLLM Inference configuration
+inference:
+  # Model configuration
+  model_path: "your/model/path" # Path to the model checkpoint
+  quantization: null            # Quantization method (awq, gptq, squeezellm)
+  dtype: "auto"                 # Data type for model weights (half, float, bfloat16, auto)
+  trust_remote_code: false      # Trust remote code when loading the model
+
+  # Server configuration
+  port: 8000                    # Port to run the server on
+  host: "0.0.0.0"               # Host to run the server on
+
+  # Performance configuration
+  tensor_parallel_size: 1       # Number of GPUs to use for tensor parallelism
+  max_model_len: 1024           # Maximum sequence length
+  max_num_seqs: 16              # Maximum number of sequences
+  gpu_memory_utilization: 0.9   # Fraction of GPU memory to use
+  enforce_eager: false          # Enforce eager execution
+
+  # Additional vLLM parameters (optional)
+  # swap_space: 4               # Size of CPU swap space in GiB
+  # block_size: 16              # Size of blocks used in the KV cache
+  # disable_log_stats: true     # Disable logging of stats
+  # disable_log_requests: false # Disable logging of requests
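
For reference, a minimal usage sketch (not part of this commit) of querying the server once it is running with the defaults above. It assumes the `requests` package is installed and that the served model name matches the model_path value; vLLM's server exposes an OpenAI-compatible /v1/chat/completions route:

import requests

# Host, port, and length limits mirror the inference section of config.yaml above.
BASE_URL = "http://0.0.0.0:8000/v1"

payload = {
    "model": "your/model/path",  # must match the model_path the server was started with
    "messages": [{"role": "user", "content": "Hello!"}],
    "temperature": 0.7,
    "max_tokens": 128,  # keep below max_model_len (1024)
}

response = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=60)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])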

src/finetune_pipeline/finetuning/finetuning.py → src/finetune_pipeline/finetuning/run_finetuning.py


+ 27 - 0
src/finetune_pipeline/inference/run_inference.py

@@ -0,0 +1,27 @@
+"""
+Script to run the model inference.
+
+"""
+
+from typing import Any, Dict, List, Optional, TypedDict, Union
+
+from vllm import SamplingParams
+
+
+class Message(TypedDict):
+    """A single chat message with a role and content string."""
+
+    role: str
+    content: str
+
+
+class InferenceRequest(TypedDict, total=False):
+    """Type definition for LLM inference request."""
+
+    model: str
+    messages: List[Message]
+    temperature: float
+    top_p: float
+    max_completion_tokens: int
+    seed: int
+    response_format: Optional[Dict[str, Any]]
+
+
+class VLLMInferenceRequest(TypedDict):
+    """Type definition for VLLM inference request format."""
+
+    messages: List[List[Message]]
+    sampling_params: Union[SamplingParams, List[SamplingParams]]
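
A usage sketch (not part of this commit) of how these request types could drive offline batch inference. It assumes an offline vllm.LLM engine and that LLM.chat, available in recent vLLM releases, accepts a batch of conversations; the model path is the placeholder from config.yaml:

from vllm import LLM, SamplingParams

# A dict shaped like VLLMInferenceRequest above: one conversation per inner list.
request = {
    "messages": [
        [{"role": "user", "content": "Summarize vLLM in one sentence."}],
        [{"role": "user", "content": "What does tensor parallelism do?"}],
    ],
    "sampling_params": SamplingParams(temperature=0.7, max_tokens=128),
}

llm = LLM(model="your/model/path")  # placeholder path, as in config.yaml
outputs = llm.chat(request["messages"], request["sampling_params"])
for output in outputs:
    print(output.outputs[0].text)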

+ 376 - 0
src/finetune_pipeline/inference/start_vllm_server.py

@@ -0,0 +1,376 @@
+#!/usr/bin/env python
+"""
+Script to start a vLLM server for inference.
+
+This script provides a convenient way to start a vLLM server with various configuration options.
+It supports loading models from local paths or Hugging Face model IDs.
+
+Example usage:
+    python start_vllm_server.py --model-path meta-llama/Llama-2-7b-chat-hf
+    python start_vllm_server.py --model-path /path/to/local/model --port 8080
+    python start_vllm_server.py --config /path/to/config.yaml
+    python start_vllm_server.py  # Uses the default config.yaml in the parent directory
+"""
+
+import argparse
+import json
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Dict, Optional
+
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+# Try to import yaml for config file support
+try:
+    import yaml
+
+    HAS_YAML = True
+except ImportError:
+    HAS_YAML = False
+    logger.warning("PyYAML not installed. Config file support limited to JSON format.")
+
+
+def read_config(config_path: str) -> Dict:
+    """
+    Read the configuration file (supports both JSON and YAML formats).
+
+    Args:
+        config_path: Path to the configuration file
+
+    Returns:
+        dict: Configuration parameters
+
+    Raises:
+        ValueError: If the file format is not supported
+        ImportError: If the required package for the file format is not installed
+    """
+    file_extension = Path(config_path).suffix.lower()
+
+    with open(config_path, "r") as f:
+        if file_extension in [".json"]:
+            config = json.load(f)
+        elif file_extension in [".yaml", ".yml"]:
+            if not HAS_YAML:
+                raise ImportError(
+                    "The 'pyyaml' package is required to load YAML files. "
+                    "Please install it with 'pip install pyyaml'."
+                )
+            config = yaml.safe_load(f)
+        else:
+            raise ValueError(
+                f"Unsupported config file format: {file_extension}. "
+                f"Supported formats are: .json, .yaml, .yml"
+            )
+
+    return config
+
+
+def check_vllm_installed() -> bool:
+    """
+    Check if vLLM is installed.
+
+    Returns:
+        bool: True if vLLM is installed, False otherwise
+    """
+    try:
+        subprocess.run(
+            ["vllm", "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+        )
+        return True
+    except FileNotFoundError:
+        return False
+
+
+def start_vllm_server(
+    model_path: str,
+    port: int = 8000,
+    host: str = "0.0.0.0",
+    tensor_parallel_size: int = 1,
+    max_model_len: int = 4096,
+    max_num_seqs: int = 256,
+    quantization: Optional[str] = None,
+    dtype: str = "auto",
+    gpu_memory_utilization: float = 0.9,
+    trust_remote_code: bool = False,
+    enforce_eager: bool = False,
+    additional_args: Optional[Dict] = None,
+) -> None:
+    """
+    Start a vLLM server with the specified parameters.
+
+    Args:
+        model_path: Path to the model or Hugging Face model ID
+        port: Port to run the server on
+        host: Host to run the server on
+        tensor_parallel_size: Number of GPUs to use for tensor parallelism
+        max_model_len: Maximum sequence length
+        max_num_seqs: Maximum number of sequences
+        quantization: Quantization method (e.g., "awq", "gptq", "squeezellm")
+        dtype: Data type for model weights (e.g., "half", "float", "bfloat16", "auto")
+        gpu_memory_utilization: Fraction of GPU memory to use
+        trust_remote_code: Whether to trust remote code when loading the model
+        enforce_eager: Whether to enforce eager execution
+        additional_args: Additional arguments to pass to vLLM
+
+    Note:
+        Exits the process with a non-zero status if vLLM is not installed or
+        if the server fails to start.
+    """
+    # Check if vLLM is installed
+    if not check_vllm_installed():
+        logger.error(
+            "vLLM is not installed. Please install it with 'pip install vllm'."
+        )
+        sys.exit(1)
+
+    # Build the command
+    cmd = ["vllm", "serve", model_path]
+
+    # Add basic parameters
+    cmd.extend(["--port", str(port)])
+    cmd.extend(["--host", host])
+    cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
+    cmd.extend(["--max-model-len", str(max_model_len)])
+    cmd.extend(["--max-num-seqs", str(max_num_seqs)])
+    cmd.extend(["--gpu-memory-utilization", str(gpu_memory_utilization)])
+    cmd.extend(["--dtype", dtype])
+
+    # Add optional parameters
+    if quantization:
+        cmd.extend(["--quantization", quantization])
+
+    if trust_remote_code:
+        cmd.append("--trust-remote-code")
+
+    if enforce_eager:
+        cmd.append("--enforce-eager")
+
+    # Add additional arguments (config keys use underscores, CLI flags use hyphens)
+    if additional_args:
+        for key, value in additional_args.items():
+            flag = f"--{key.replace('_', '-')}"
+            if isinstance(value, bool):
+                if value:
+                    cmd.append(flag)
+            else:
+                cmd.extend([flag, str(value)])
+
+    # Log the command
+    logger.info(f"Starting vLLM server with command: {' '.join(cmd)}")
+
+    # Run the command
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to start vLLM server: {e}")
+        sys.exit(1)
+    except KeyboardInterrupt:
+        logger.info("vLLM server stopped by user.")
+        sys.exit(0)
+
+
+def find_config_file() -> Optional[str]:
+    """
+    Find the config.yaml file in the parent directory.
+
+    Returns:
+        Optional[str]: Path to the config file, or None if it does not exist
+    """
+    # Try to find the config file in the parent directory
+    script_dir = Path(__file__).resolve().parent
+    parent_dir = script_dir.parent
+    config_path = parent_dir / "config.yaml"
+
+    if config_path.exists():
+        return str(config_path)
+    else:
+        return None
+
+
+def main():
+    """Main function."""
+    parser = argparse.ArgumentParser(description="Start a vLLM server for inference")
+
+    # Configuration options
+    config_group = parser.add_argument_group("Configuration")
+    config_group.add_argument(
+        "--config",
+        type=str,
+        help="Path to a configuration file (JSON or YAML)",
+    )
+
+    # Model options
+    model_group = parser.add_argument_group("Model")
+    model_group.add_argument(
+        "--model-path",
+        type=str,
+        help="Path to the model or Hugging Face model ID",
+    )
+    model_group.add_argument(
+        "--quantization",
+        type=str,
+        choices=["awq", "gptq", "squeezellm"],
+        help="Quantization method to use",
+    )
+    model_group.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["half", "float", "bfloat16", "auto"],
+        help="Data type for model weights",
+    )
+    model_group.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code when loading the model",
+    )
+
+    # Server options
+    server_group = parser.add_argument_group("Server")
+    server_group.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port to run the server on",
+    )
+    server_group.add_argument(
+        "--host",
+        type=str,
+        default="0.0.0.0",
+        help="Host to run the server on",
+    )
+
+    # Performance options
+    perf_group = parser.add_argument_group("Performance")
+    perf_group.add_argument(
+        "--tensor-parallel-size",
+        type=int,
+        default=1,
+        help="Number of GPUs to use for tensor parallelism",
+    )
+    perf_group.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum sequence length",
+    )
+    perf_group.add_argument(
+        "--max-num-seqs",
+        type=int,
+        default=256,
+        help="Maximum number of sequences",
+    )
+    perf_group.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.9,
+        help="Fraction of GPU memory to use",
+    )
+    perf_group.add_argument(
+        "--enforce-eager",
+        action="store_true",
+        help="Enforce eager execution",
+    )
+
+    args = parser.parse_args()
+
+    # Load config file
+    config = {}
+    config_path = args.config
+
+    # If no config file is provided, try to find the default one
+    if not config_path:
+        config_path = find_config_file()
+        if config_path:
+            logger.info(f"Using default config file: {config_path}")
+
+    if config_path:
+        try:
+            config = read_config(config_path)
+            logger.info(f"Loaded configuration from {config_path}")
+        except Exception as e:
+            logger.error(f"Failed to load configuration from {config_path}: {e}")
+            sys.exit(1)
+
+    # Extract inference section from config if it exists
+    inference_config = config.get("inference", {})
+
+    # Merge command-line arguments with config file
+    # Command-line arguments take precedence
+    model_path = args.model_path or inference_config.get("model_path")
+    if not model_path:
+        logger.error(
+            "Model path must be provided either via --model-path or in the config file under inference.model_path"
+        )
+        sys.exit(1)
+
+    # Extract parameters
+    params = {
+        "model_path": model_path,
+        "port": (
+            args.port
+            if args.port != parser.get_default("port")
+            else inference_config.get("port", args.port)
+        ),
+        "host": (
+            args.host
+            if args.host != parser.get_default("host")
+            else inference_config.get("host", args.host)
+        ),
+        "tensor_parallel_size": (
+            args.tensor_parallel_size
+            if args.tensor_parallel_size != parser.get_default("tensor_parallel_size")
+            else inference_config.get("tensor_parallel_size", args.tensor_parallel_size)
+        ),
+        "max_model_len": (
+            args.max_model_len
+            if args.max_model_len != parser.get_default("max_model_len")
+            else inference_config.get("max_model_len", args.max_model_len)
+        ),
+        "max_num_seqs": (
+            args.max_num_seqs
+            if args.max_num_seqs != parser.get_default("max_num_seqs")
+            else inference_config.get("max_num_seqs", args.max_num_seqs)
+        ),
+        "quantization": args.quantization or inference_config.get("quantization"),
+        "dtype": (
+            args.dtype
+            if args.dtype != parser.get_default("dtype")
+            else inference_config.get("dtype", args.dtype)
+        ),
+        "gpu_memory_utilization": (
+            args.gpu_memory_utilization
+            if args.gpu_memory_utilization
+            != parser.get_default("gpu_memory_utilization")
+            else inference_config.get(
+                "gpu_memory_utilization", args.gpu_memory_utilization
+            )
+        ),
+        "trust_remote_code": args.trust_remote_code
+        or inference_config.get("trust_remote_code", False),
+        "enforce_eager": args.enforce_eager
+        or inference_config.get("enforce_eager", False),
+    }
+
+    # Get additional arguments from inference config
+    additional_args = {k: v for k, v in inference_config.items() if k not in params}
+    if additional_args:
+        params["additional_args"] = additional_args
+
+    # Start the vLLM server
+    start_vllm_server(**params)
+
+
+if __name__ == "__main__":
+    main()
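
With the inference defaults from config.yaml above and no overriding command-line flags (flags take precedence over config values, which in turn override the argparse defaults), start_vllm_server() assembles a command equivalent to the cmd list below; quantization is null and the boolean flags are false, so they are omitted:

# Equivalent to: vllm serve your/model/path --port 8000 --host 0.0.0.0 ...
cmd = [
    "vllm", "serve", "your/model/path",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--tensor-parallel-size", "1",
    "--max-model-len", "1024",
    "--max-num-seqs", "16",
    "--gpu-memory-utilization", "0.9",
    "--dtype", "auto",
]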