
updated vllm params and start server

Ubuntu, 1 month ago
Parent
Current commit
b458c21c03

+ 6 - 8
src/finetune_pipeline/config.yaml

@@ -21,21 +21,19 @@ formatter:
 
 # Training configuration
 finetuning:
-  strategy: "fft"               # Training strategy ('fft' or 'lora')
+  strategy: "lora"               # Training strategy ('fft' or 'lora')
   num_epochs: 1                 # Number of training epochs
   batch_size: 1                 # Batch size per device for training
   torchtune_config: "llama3_2_vision/11B_lora"             # TorchTune-specific configuration
-  num_processes_per_node: 1             # TorchTune-specific configuration
-  distributed: false             # Whether to use distributed training
+  num_processes_per_node: 8             # TorchTune-specific configuration
+  distributed: true             # Whether to use distributed training
 
 
 # vLLM Inference configuration
 inference:
   # Model configuration
-  model_path: "your/model/path" # Path to the model checkpoint
+  model_path: "/home/ubuntu/yash-workspace/medgemma-4b-it" # Path to the model checkpoint
   quantization: null            # Quantization method (awq, gptq, squeezellm)
-  dtype: "auto"                 # Data type for model weights (half, float, bfloat16, auto)
-  trust_remote_code: false      # Trust remote code when loading the model
 
   # Server configuration
   port: 8000                    # Port to run the server on
@@ -43,8 +41,8 @@ inference:
 
   # Performance configuration
   tensor_parallel_size: 1       # Number of GPUs to use for tensor parallelism
-  max_model_len: 1024           # Maximum sequence length
-  max_num_seqs: 16              # Maximum number of sequences
+  max_model_len: 32           # Maximum sequence length
+  max_num_seqs: 1              # Maximum number of sequences
   gpu_memory_utilization: 0.9   # Fraction of GPU memory to use
   enforce_eager: false          # Enforce eager execution
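
Note on this hunk: the finetuning block switches from full fine-tuning to LoRA and turns on distributed training over 8 processes per node, while the inference block now points at a local MedGemma checkpoint, drops the `dtype` and `trust_remote_code` options, and shrinks the server to one sequence of at most 32 tokens at a time. For context, a rough sketch of the argv that `start_vllm_server` would build from these settings; the `vllm serve <model>` prefix is an assumption, since the part of the function that builds it is not shown in this diff:

```python
# Sketch only, not part of the commit. Flag names match the ones the diff
# shows being appended; the "vllm serve <model>" prefix is assumed.
inference = {
    "model_path": "/home/ubuntu/yash-workspace/medgemma-4b-it",
    "port": 8000,
    "tensor_parallel_size": 1,
    "max_model_len": 32,
    "max_num_seqs": 1,
    "gpu_memory_utilization": 0.9,
}

cmd = [
    "vllm", "serve", inference["model_path"],
    "--port", str(inference["port"]),
    "--tensor-parallel-size", str(inference["tensor_parallel_size"]),
    "--max-model-len", str(inference["max_model_len"]),
    "--max-num-seqs", str(inference["max_num_seqs"]),
    "--gpu-memory-utilization", str(inference["gpu_memory_utilization"]),
]
# --dtype and --trust-remote-code are no longer emitted, so vLLM falls back
# to its own defaults (dtype="auto", trust_remote_code=False).
```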
 

+ 18 - 18
src/finetune_pipeline/inference/__init__.py

@@ -4,22 +4,22 @@ Inference utilities for LLMs.
 This module provides tools for running inference with fine-tuned models.
 """
 
-from .inference import (
-    run_inference_from_config,
-    run_inference_on_eval_data,
-    VLLMClient,
-    VLLMInferenceRequest,
-)
-from .start_vllm_server import check_vllm_installed, read_config, start_vllm_server
+# from .inference import (
+#     run_inference_from_config,
+#     run_inference_on_eval_data,
+#     VLLMClient,
+#     VLLMInferenceRequest,
+# )
+# from .start_vllm_server import check_vllm_installed, read_config, start_vllm_server
 
-__all__ = [
-    # From inference
-    "VLLMClient",
-    "VLLMInferenceRequest",
-    "run_inference_on_eval_data",
-    "run_inference_from_config",
-    # From start_vllm_server
-    "start_vllm_server",
-    "read_config",
-    "check_vllm_installed",
-]
+# __all__ = [
+#     # From inference
+#     "VLLMClient",
+#     "VLLMInferenceRequest",
+#     "run_inference_on_eval_data",
+#     "run_inference_from_config",
+#     # From start_vllm_server
+#     "start_vllm_server",
+#     "read_config",
+#     "check_vllm_installed",
+# ]
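
With the package-level re-exports commented out, `from finetune_pipeline.inference import VLLMClient` now raises `ImportError`; callers have to import from the submodules directly. A minimal sketch, assuming the module layout shown in this diff and that `read_config` takes the YAML path and returns a dict:

```python
# Sketch: direct submodule imports, now that the package no longer
# re-exports these names (paths assume the layout shown in this diff).
from finetune_pipeline.inference.start_vllm_server import (
    check_vllm_installed,
    read_config,          # assumed: takes the config path, returns a dict
    start_vllm_server,
)

if not check_vllm_installed():
    raise RuntimeError("vLLM is not installed")

config = read_config("src/finetune_pipeline/config.yaml")
inference_cfg = config.get("inference", {})
```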

+ 0 - 34
src/finetune_pipeline/inference/start_vllm_server.py

@@ -101,9 +101,7 @@ def start_vllm_server(
     max_model_len: int = 4096,
     max_num_seqs: int = 256,
     quantization: Optional[str] = None,
-    dtype: str = "auto",
     gpu_memory_utilization: float = 0.9,
-    trust_remote_code: bool = False,
     enforce_eager: bool = False,
     additional_args: Optional[Dict] = None,
 ) -> None:
@@ -145,27 +143,14 @@ def start_vllm_server(
     cmd.extend(["--max-model-len", str(max_model_len)])
     cmd.extend(["--max-num-seqs", str(max_num_seqs)])
     cmd.extend(["--gpu-memory-utilization", str(gpu_memory_utilization)])
-    cmd.extend(["--dtype", dtype])
 
     # Add optional parameters
     if quantization:
         cmd.extend(["--quantization", quantization])
 
-    if trust_remote_code:
-        cmd.append("--trust-remote-code")
-
     if enforce_eager:
         cmd.append("--enforce-eager")
 
-    # Add additional arguments
-    if additional_args:
-        for key, value in additional_args.items():
-            if isinstance(value, bool):
-                if value:
-                    cmd.append(f"--{key}")
-            else:
-                cmd.extend([f"--{key}", str(value)])
-
     # Log the command
     logger.info(f"Starting vLLM server with command: {' '.join(cmd)}")
 
@@ -223,18 +208,6 @@ def main():
         choices=["awq", "gptq", "squeezellm"],
         help="Quantization method to use",
     )
-    model_group.add_argument(
-        "--dtype",
-        type=str,
-        default="auto",
-        choices=["half", "float", "bfloat16", "auto"],
-        help="Data type for model weights",
-    )
-    model_group.add_argument(
-        "--trust-remote-code",
-        action="store_true",
-        help="Trust remote code when loading the model",
-    )
 
     # Server options
     server_group = parser.add_argument_group("Server")
@@ -344,11 +317,6 @@ def main():
             else inference_config.get("max_num_seqs", args.max_num_seqs)
         ),
         "quantization": args.quantization or inference_config.get("quantization"),
-        "dtype": (
-            args.dtype
-            if args.dtype != parser.get_default("dtype")
-            else inference_config.get("dtype", args.dtype)
-        ),
         "gpu_memory_utilization": (
             args.gpu_memory_utilization
             if args.gpu_memory_utilization
@@ -357,8 +325,6 @@ def main():
                 "gpu_memory_utilization", args.gpu_memory_utilization
             )
         ),
-        "trust_remote_code": args.trust_remote_code
-        or inference_config.get("trust_remote_code", False),
         "enforce_eager": args.enforce_eager
         or inference_config.get("enforce_eager", False),
     }
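
One side effect worth noting: the function keeps its `additional_args` parameter, but the loop that forwarded it to the command line is gone, so anything passed there is now silently ignored, and with the `--dtype` / `--trust-remote-code` branches removed there is no remaining way to set those flags through this wrapper. A small sketch that reproduces the deleted forwarding loop, in case a caller still needs that behavior:

```python
from typing import Dict, List


def extend_with_additional_args(cmd: List[str], additional_args: Dict) -> List[str]:
    """Reproduces the forwarding loop removed in this commit: a True boolean
    becomes a bare flag, anything else becomes "--key value"."""
    for key, value in additional_args.items():
        if isinstance(value, bool):
            if value:
                cmd.append(f"--{key}")
        else:
            cmd.extend([f"--{key}", str(value)])
    return cmd


# Example: restore the dropped flags for a model that needs them.
cmd = extend_with_additional_args(
    ["vllm", "serve", "my-model"],            # hypothetical base command
    {"dtype": "bfloat16", "trust-remote-code": True},
)
```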