1 月之前 · b458c21c03
--- a/src/finetune_pipeline/config.yaml
+++ b/src/finetune_pipeline/config.yaml
@@ -21,21 +21,19 @@ formatter:
 
				 
			
 
				 # Training configuration
			
 
				 finetuning:
			
 
				-  strategy: "fft"               # Training strategy ('fft' or 'lora')
			
 
				+  strategy: "lora"               # Training strategy ('fft' or 'lora')
			
 
				   num_epochs: 1                 # Number of training epochs
			
 
				   batch_size: 1                 # Batch size per device for training
			
 
				   torchtune_config: "llama3_2_vision/11B_lora"             # TorchTune-specific configuration
			
 
				-  num_processes_per_node: 1             # TorchTune-specific configuration
			
 
				-  distributed: false             # Whether to use distributed training
			
 
				+  num_processes_per_node: 8             # Number of processes to launch per node
			
 
				+  distributed: true             # Whether to use distributed training
			
 
				 
			
 
				 
			
 
				 # vLLM Inference configuration
			
 
				 inference:
			
 
				   # Model configuration
			
 
				-  model_path: "your/model/path" # Path to the model checkpoint
			
 
				+  model_path: "/home/ubuntu/yash-workspace/medgemma-4b-it" # Path to the model checkpoint
			
 
				   quantization: null            # Quantization method (awq, gptq, squeezellm)
			
 
				-  dtype: "auto"                 # Data type for model weights (half, float, bfloat16, auto)
			
 
				-  trust_remote_code: false      # Trust remote code when loading the model
			
 
				 
			
 
				   # Server configuration
			
 
				   port: 8000                    # Port to run the server on
			
@@ -43,8 +41,8 @@ inference:
 
				 
			
 
				   # Performance configuration
			
 
				   tensor_parallel_size: 1       # Number of GPUs to use for tensor parallelism
			
 
				-  max_model_len: 1024           # Maximum sequence length
			
 
				-  max_num_seqs: 16              # Maximum number of sequences
			
 
				+  max_model_len: 32           # Maximum sequence length
			
 
				+  max_num_seqs: 1              # Maximum number of sequences batched per iteration
			
 
				   gpu_memory_utilization: 0.9   # Fraction of GPU memory to use
			
 
				   enforce_eager: false          # Enforce eager execution
			
 
				 
			
--- a/src/finetune_pipeline/inference/__init__.py
+++ b/src/finetune_pipeline/inference/__init__.py
@@ -4,22 +4,22 @@ Inference utilities for LLMs.
 
				 This module provides tools for running inference with fine-tuned models.
			
 
				 """
			
 
				 
			
 
				-from .inference import (
			
 
				-    run_inference_from_config,
			
 
				-    run_inference_on_eval_data,
			
 
				-    VLLMClient,
			
 
				-    VLLMInferenceRequest,
			
 
				-)
			
 
				-from .start_vllm_server import check_vllm_installed, read_config, start_vllm_server
			
 
				+# from .inference import (
			
 
				+#     run_inference_from_config,
			
 
				+#     run_inference_on_eval_data,
			
 
				+#     VLLMClient,
			
 
				+#     VLLMInferenceRequest,
			
 
				+# )
			
 
				+# from .start_vllm_server import check_vllm_installed, read_config, start_vllm_server
			
 
				 
			
 
				-__all__ = [
			
 
				-    # From inference
			
 
				-    "VLLMClient",
			
 
				-    "VLLMInferenceRequest",
			
 
				-    "run_inference_on_eval_data",
			
 
				-    "run_inference_from_config",
			
 
				-    # From start_vllm_server
			
 
				-    "start_vllm_server",
			
 
				-    "read_config",
			
 
				-    "check_vllm_installed",
			
 
				-]
			
 
				+# __all__ = [
			
 
				+#     # From inference
			
 
				+#     "VLLMClient",
			
 
				+#     "VLLMInferenceRequest",
			
 
				+#     "run_inference_on_eval_data",
			
 
				+#     "run_inference_from_config",
			
 
				+#     # From start_vllm_server
			
 
				+#     "start_vllm_server",
			
 
				+#     "read_config",
			
 
				+#     "check_vllm_installed",
			
 
				+# ]
			
--- a/src/finetune_pipeline/inference/start_vllm_server.py
+++ b/src/finetune_pipeline/inference/start_vllm_server.py
@@ -101,9 +101,7 @@ def start_vllm_server(
 
				     max_model_len: int = 4096,
			
 
				     max_num_seqs: int = 256,
			
 
				     quantization: Optional[str] = None,
			
 
				-    dtype: str = "auto",
			
 
				     gpu_memory_utilization: float = 0.9,
			
 
				-    trust_remote_code: bool = False,
			
 
				     enforce_eager: bool = False,
			
 
				     additional_args: Optional[Dict] = None,
			
 
				 ) -> None:
			
@@ -145,27 +143,14 @@ def start_vllm_server(
 
				     cmd.extend(["--max-model-len", str(max_model_len)])
			
 
				     cmd.extend(["--max-num-seqs", str(max_num_seqs)])
			
 
				     cmd.extend(["--gpu-memory-utilization", str(gpu_memory_utilization)])
			
 
				-    cmd.extend(["--dtype", dtype])
			
 
				 
			
 
				     # Add optional parameters
			
 
				     if quantization:
			
 
				         cmd.extend(["--quantization", quantization])
			
 
				 
			
 
				-    if trust_remote_code:
			
 
				-        cmd.append("--trust-remote-code")
			
 
				-
			
 
				     if enforce_eager:
			
 
				         cmd.append("--enforce-eager")
			
 
				 
			
 
				-    # Add additional arguments
			
 
				-    if additional_args:
			
 
				-        for key, value in additional_args.items():
			
 
				-            if isinstance(value, bool):
			
 
				-                if value:
			
 
				-                    cmd.append(f"--{key}")
			
 
				-            else:
			
 
				-                cmd.extend([f"--{key}", str(value)])
			
 
				-
			
 
				     # Log the command
			
 
				     logger.info(f"Starting vLLM server with command: {' '.join(cmd)}")
			
 
				 
			
@@ -223,18 +208,6 @@ def main():
 
				         choices=["awq", "gptq", "squeezellm"],
			
 
				         help="Quantization method to use",
			
 
				     )
			
 
				-    model_group.add_argument(
			
 
				-        "--dtype",
			
 
				-        type=str,
			
 
				-        default="auto",
			
 
				-        choices=["half", "float", "bfloat16", "auto"],
			
 
				-        help="Data type for model weights",
			
 
				-    )
			
 
				-    model_group.add_argument(
			
 
				-        "--trust-remote-code",
			
 
				-        action="store_true",
			
 
				-        help="Trust remote code when loading the model",
			
 
				-    )
			
 
				 
			
 
				     # Server options
			
 
				     server_group = parser.add_argument_group("Server")
			
@@ -344,11 +317,6 @@ def main():
 
				             else inference_config.get("max_num_seqs", args.max_num_seqs)
			
 
				         ),
			
 
				         "quantization": args.quantization or inference_config.get("quantization"),
			
 
				-        "dtype": (
			
 
				-            args.dtype
			
 
				-            if args.dtype != parser.get_default("dtype")
			
 
				-            else inference_config.get("dtype", args.dtype)
			
 
				-        ),
			
 
				         "gpu_memory_utilization": (
			
 
				             args.gpu_memory_utilization
			
 
				             if args.gpu_memory_utilization
			
@@ -357,8 +325,6 @@ def main():
 
				                 "gpu_memory_utilization", args.gpu_memory_utilization
			
 
				             )
			
 
				         ),
			
 
				-        "trust_remote_code": args.trust_remote_code
			
 
				-        or inference_config.get("trust_remote_code", False),
			
 
				         "enforce_eager": args.enforce_eager
			
 
				         or inference_config.get("enforce_eager", False),
			
 
				     }