#!/usr/bin/env python
"""
Script to start a vLLM server for inference.

This script provides a convenient way to start a vLLM server with various
configuration options. It supports loading models from local paths or
Hugging Face model IDs.

Example usage:
    python start_vllm_server.py --model-path meta-llama/Llama-2-7b-chat-hf
    python start_vllm_server.py --model-path /path/to/local/model --port 8080
    python start_vllm_server.py --config /path/to/config.yaml
    python start_vllm_server.py  # Uses the default config.yaml in the parent directory
"""
import argparse
import json
import logging
import subprocess
import sys
from pathlib import Path
from typing import Dict, Optional

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Try to import yaml for config file support
try:
    import yaml

    HAS_YAML = True
except ImportError:
    HAS_YAML = False
    logger.warning("PyYAML not installed. Config file support limited to JSON format.")


def read_config(config_path: str) -> Dict:
    """
    Read the configuration file (supports both JSON and YAML formats).

    Args:
        config_path: Path to the configuration file

    Returns:
        dict: Configuration parameters

    Raises:
        ValueError: If the file format is not supported
        ImportError: If the required package for the file format is not installed
    """
    file_extension = Path(config_path).suffix.lower()

    with open(config_path, "r") as f:
        if file_extension == ".json":
            config = json.load(f)
        elif file_extension in [".yaml", ".yml"]:
            if not HAS_YAML:
                raise ImportError(
                    "The 'pyyaml' package is required to load YAML files. "
                    "Please install it with 'pip install pyyaml'."
                )
            config = yaml.safe_load(f)
        else:
            raise ValueError(
                f"Unsupported config file format: {file_extension}. "
                f"Supported formats are: .json, .yaml, .yml"
            )

    return config


def check_vllm_installed() -> bool:
    """
    Check if vLLM is installed.

    Returns:
        bool: True if vLLM is installed, False otherwise
    """
    try:
        subprocess.run(
            ["vllm", "--help"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        return True
    except FileNotFoundError:
        return False


def start_vllm_server(
    model_path: str,
    port: int = 8000,
    host: str = "0.0.0.0",
    tensor_parallel_size: int = 1,
    max_model_len: int = 4096,
    max_num_seqs: int = 256,
    quantization: Optional[str] = None,
    gpu_memory_utilization: float = 0.9,
    enforce_eager: bool = False,
    additional_args: Optional[Dict] = None,
) -> None:
    """
    Start a vLLM server with the specified parameters.

    Args:
        model_path: Path to the model or Hugging Face model ID
        port: Port to run the server on
        host: Host to run the server on
        tensor_parallel_size: Number of GPUs to use for tensor parallelism
        max_model_len: Maximum sequence length
        max_num_seqs: Maximum number of sequences
        quantization: Quantization method (e.g., "awq", "gptq", "squeezellm")
        gpu_memory_utilization: Fraction of GPU memory to use
        enforce_eager: Whether to enforce eager execution
        additional_args: Additional arguments to pass to vLLM as CLI flags

    Note:
        Exits the process if vLLM is not installed or if the server fails to start.
    """
    # Check if vLLM is installed
    if not check_vllm_installed():
        logger.error(
            "vLLM is not installed. Please install it with 'pip install vllm'."
        )
        sys.exit(1)

    # Build the command
    cmd = ["vllm", "serve", model_path]

    # Add basic parameters
    cmd.extend(["--port", str(port)])
    cmd.extend(["--host", host])
    cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    cmd.extend(["--max-model-len", str(max_model_len)])
    cmd.extend(["--max-num-seqs", str(max_num_seqs)])
    cmd.extend(["--gpu-memory-utilization", str(gpu_memory_utilization)])

    # Add optional parameters
    if quantization:
        cmd.extend(["--quantization", quantization])
    if enforce_eager:
        cmd.append("--enforce-eager")

    # Pass through any extra options, assuming each key maps directly to a vLLM
    # CLI flag (e.g. "swap_space" -> "--swap-space"); boolean True values become
    # bare flags and False values are skipped.
    if additional_args:
        for key, value in additional_args.items():
            flag = f"--{key.replace('_', '-')}"
            if isinstance(value, bool):
                if value:
                    cmd.append(flag)
            else:
                cmd.extend([flag, str(value)])

    # Log the command
    logger.info(f"Starting vLLM server with command: {' '.join(cmd)}")

    # Run the command
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed to start vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        logger.info("vLLM server stopped by user.")
        sys.exit(0)
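
# Example usage (illustrative):
#   start_vllm_server("meta-llama/Llama-2-7b-chat-hf", port=8080)
# builds and runs roughly:
#   vllm serve meta-llama/Llama-2-7b-chat-hf --port 8080 --host 0.0.0.0 \
#       --tensor-parallel-size 1 --max-model-len 4096 --max-num-seqs 256 \
#       --gpu-memory-utilization 0.9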


def find_config_file() -> Optional[str]:
    """
    Find the config.yaml file in the parent directory.

    Returns:
        str or None: Path to the config file, or None if it does not exist
    """
    # Try to find the config file in the parent directory
    script_dir = Path(__file__).resolve().parent
    parent_dir = script_dir.parent
    config_path = parent_dir / "config.yaml"

    if config_path.exists():
        return str(config_path)
    else:
        return None


def main():
    """Main function."""
    parser = argparse.ArgumentParser(description="Start a vLLM server for inference")

    # Configuration options
    config_group = parser.add_argument_group("Configuration")
    config_group.add_argument(
        "--config",
        type=str,
        help="Path to a configuration file (JSON or YAML)",
    )

    # Model options
    model_group = parser.add_argument_group("Model")
    model_group.add_argument(
        "--model-path",
        type=str,
        help="Path to the model or Hugging Face model ID",
    )
    model_group.add_argument(
        "--quantization",
        type=str,
        choices=["awq", "gptq", "squeezellm"],
        help="Quantization method to use",
    )

    # Server options
    server_group = parser.add_argument_group("Server")
    server_group.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to run the server on",
    )
    server_group.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to run the server on",
    )

    # Performance options
    perf_group = parser.add_argument_group("Performance")
    perf_group.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=1,
        help="Number of GPUs to use for tensor parallelism",
    )
    perf_group.add_argument(
        "--max-model-len",
        type=int,
        default=4096,
        help="Maximum sequence length",
    )
    perf_group.add_argument(
        "--max-num-seqs",
        type=int,
        default=256,
        help="Maximum number of sequences",
    )
    perf_group.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.9,
        help="Fraction of GPU memory to use",
    )
    perf_group.add_argument(
        "--enforce-eager",
        action="store_true",
        help="Enforce eager execution",
    )

    args = parser.parse_args()

    # Load config file
    config = {}
    config_path = args.config

    # If no config file is provided, try to find the default one
    if not config_path:
        config_path = find_config_file()
        if config_path:
            logger.info(f"Using default config file: {config_path}")

    if config_path:
        try:
            config = read_config(config_path)
            logger.info(f"Loaded configuration from {config_path}")
        except Exception as e:
            logger.error(f"Failed to load configuration from {config_path}: {e}")
            sys.exit(1)

    # Extract the inference section from the config if it exists
    inference_config = config.get("inference", {})

    # Merge command-line arguments with the config file;
    # command-line arguments take precedence
    model_path = args.model_path or inference_config.get("model_path")
    if not model_path:
        logger.error(
            "Model path must be provided either via --model-path or in the config "
            "file under inference.model_path"
        )
        sys.exit(1)

    # Extract parameters
    params = {
        "model_path": model_path,
        "port": (
            args.port
            if args.port != parser.get_default("port")
            else inference_config.get("port", args.port)
        ),
        "host": (
            args.host
            if args.host != parser.get_default("host")
            else inference_config.get("host", args.host)
        ),
        "tensor_parallel_size": (
            args.tensor_parallel_size
            if args.tensor_parallel_size != parser.get_default("tensor_parallel_size")
            else inference_config.get("tensor_parallel_size", args.tensor_parallel_size)
        ),
        "max_model_len": (
            args.max_model_len
            if args.max_model_len != parser.get_default("max_model_len")
            else inference_config.get("max_model_len", args.max_model_len)
        ),
        "max_num_seqs": (
            args.max_num_seqs
            if args.max_num_seqs != parser.get_default("max_num_seqs")
            else inference_config.get("max_num_seqs", args.max_num_seqs)
        ),
        "quantization": args.quantization or inference_config.get("quantization"),
        "gpu_memory_utilization": (
            args.gpu_memory_utilization
            if args.gpu_memory_utilization != parser.get_default("gpu_memory_utilization")
            else inference_config.get("gpu_memory_utilization", args.gpu_memory_utilization)
        ),
        "enforce_eager": args.enforce_eager
        or inference_config.get("enforce_eager", False),
    }
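
    # Precedence example (illustrative): if config.yaml sets "port: 9000" under
    # "inference" but the script is run with --port 8080, the explicit
    # command-line value wins because it differs from the argparse default;
    # values left at their defaults fall back to the config file.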

    # Get additional arguments from the inference config
    additional_args = {k: v for k, v in inference_config.items() if k not in params}
    if additional_args:
        params["additional_args"] = additional_args

    # Start the vLLM server
    start_vllm_server(**params)


if __name__ == "__main__":
    main()