
Update annotating.py

Sanyam Bhutani 5 months ago
parent
commit
d2db35bfbc

+ 88 - 0
end-to-end-use-cases/data-tool/dataprep-scripts/annotating.py

@@ -109,3 +109,91 @@ def process_dataset(
                 "conversations": output.outputs[0].text,
             }
             f.write(json.dumps(result) + "\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Process dataset using vLLM with multi-GPU support"
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="Name or path of the model to use"
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Path to YAML config file containing system prompt",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        default="processed_outputs.jsonl",
+        help="Output file path",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, required=True, help="Path to the dataset"
+    )
+    parser.add_argument(
+        "--gpu-ids",
+        type=str,
+        help="Comma-separated list of GPU IDs to use (e.g., '0,1,2,3')",
+    )
+    parser.add_argument(
+        "--tensor-parallel-size",
+        type=int,
+        default=1,
+        help="Number of GPUs to use for tensor parallelism",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.9,
+        help="Target GPU memory utilization (0.0 to 1.0)",
+    )
+    # Add new arguments for range specification
+    parser.add_argument(
+        "--start-index",
+        type=int,
+        default=0,
+        help="Starting index in the dataset (inclusive)",
+    )
+    parser.add_argument(
+        "--end-index",
+        type=int,
+        help="Ending index in the dataset (exclusive). If not specified, processes until the end.",
+    )
+    args = parser.parse_args()
+
+    # Parse GPU IDs if provided
+    gpu_ids = None
+    if args.gpu_ids:
+        gpu_ids = [int(gpu_id) for gpu_id in args.gpu_ids.split(",")]
+
+    # Load system prompt from YAML
+    system_prompt = load_system_prompt(args.config)
+
+    # Load dataset (note: only the first 2000 examples are selected here,
+    # regardless of --start-index/--end-index)
+    dataset = load_from_disk(args.dataset_path)
+    dataset = dataset.select(range(0, 2000))
+
+    # Initialize vLLM with multi-GPU support
+    llm = setup_llm(
+        model_name=args.model,
+        tensor_parallel_size=args.tensor_parallel_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        gpu_ids=gpu_ids,
+    )
+
+    # Process dataset
+    process_dataset(
+        dataset=dataset,
+        llm=llm,
+        system_prompt=system_prompt,
+        output_file=args.output_file,
+        start_index=args.start_index,
+        end_index=args.end_index,
+    )
+
+
+if __name__ == "__main__":
+    main()
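
For context: setup_llm, load_system_prompt, and process_dataset are defined earlier in annotating.py and are not part of this diff. A minimal sketch of what a helper like setup_llm typically looks like, assuming the standard vllm.LLM constructor and CUDA_VISIBLE_DEVICES for GPU pinning (names and defaults inferred from the call site above, not from the actual file), is:

    # Hypothetical sketch only -- the real setup_llm lives above line 109 of annotating.py
    import os
    from vllm import LLM

    def setup_llm(model_name, tensor_parallel_size=1, gpu_memory_utilization=0.9, gpu_ids=None):
        # Restrict vLLM to the requested GPUs, if --gpu-ids was passed
        if gpu_ids is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpu_ids)
        # tensor_parallel_size shards the model across that many GPUs;
        # gpu_memory_utilization caps the fraction of each GPU's memory vLLM reserves
        return LLM(
            model=model_name,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
        )

A typical invocation of the new entry point, with placeholder model name and paths, would look like:

    python annotating.py --model <model-name-or-path> --config config.yaml --dataset-path ./my_dataset --gpu-ids 0,1 --tensor-parallel-size 2 --start-index 0 --end-index 500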