
Update on-prem vllm scripts and readme

Chester Hu 10 months ago
Parent
Current commit
2828fd0201

+ 2 - 0
recipes/benchmarks/inference_throughput/on-prem/README.md

@@ -37,3 +37,5 @@ To run pretrained model benchmark, follow the command below.
 ```
 python pretrained_vllm_benchmark.py
 ```
+
+For more vLLM benchmarking details, refer to the official GitHub repo [here](https://github.com/vllm-project/vllm/tree/main/benchmarks).
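The benchmark scripts expect a vLLM server to already be serving the model. A minimal sanity check before launching the benchmark, assuming the OpenAI-compatible server on localhost:8000 (the host, port, and model id here are placeholders, not part of the commit):

```
# Sanity-check that a vLLM OpenAI-compatible server is reachable before
# benchmarking. Host, port, and model id are assumptions; adjust to your setup.
import requests

payload = {
    "model": "meta-llama/your-model-path",  # must match MODEL_PATH in parameters.json
    "prompt": "Hello",
    "max_tokens": 16,
}
resp = requests.post(
    "http://localhost:8000/v1/completions",
    headers={"Content-Type": "application/json"},
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```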

+ 1 - 2
recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py

@@ -4,7 +4,6 @@
 import csv
 import json
 import time
-import random
 import threading
 import numpy as np
 import requests
@@ -18,7 +17,7 @@ from azure.core.exceptions import HttpResponseError
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
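Both benchmark scripts fan requests out with ThreadPoolExecutor, and the trimmed import list keeps only the Tuple/List annotations actually used. A sketch of that concurrency pattern, assuming an endpoint and payload shape like the sanity check above (not the script's actual code):

```
# Illustrative fan-out at a fixed concurrency level; endpoint, payload shape,
# and helper names are assumptions, not the benchmark script's actual code.
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple, List

def send_request(prompt: str) -> Tuple[float, int]:
    """Send one request and return (latency_seconds, completion_tokens)."""
    start = time.time()
    resp = requests.post(
        "http://localhost:8000/v1/completions",
        headers={"Content-Type": "application/json"},
        json={"model": "meta-llama/your-model-path", "prompt": prompt, "max_tokens": 256},
    )
    resp.raise_for_status()
    tokens = resp.json()["usage"]["completion_tokens"]
    return time.time() - start, tokens

def run_level(prompts: List[str], concurrency: int) -> List[Tuple[float, int]]:
    """Dispatch all prompts at one concurrency level and gather results."""
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [pool.submit(send_request, p) for p in prompts]
        return [f.result() for f in as_completed(futures)]
```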
 
 
 

+ 1 - 1
recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json

@@ -1,7 +1,7 @@
 {
     "MAX_NEW_TOKENS" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
-    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
+    "MODEL_PATH" : "meta-llama/your-model-path",
     "MODEL_HEADERS" : {"Content-Type": "application/json"},
     "SAFE_CHECK" : true,
     "THRESHOLD_TPS" : 7,

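The scripts pick these knobs up from parameters.json; MODEL_PATH is now a placeholder to be replaced with your own model id. A minimal sketch of reading the config, assuming the scripts consume it roughly like this:

```
# Load the benchmark knobs from parameters.json; the key names come from the
# file above, but how the scripts consume them is an assumption.
import json

with open("parameters.json") as f:
    params = json.load(f)

max_new_tokens = params["MAX_NEW_TOKENS"]        # 256
concurrent_levels = params["CONCURRENT_LEVELS"]  # [1, 2, 4, ..., 256]
model_path = params["MODEL_PATH"]                # replace with your model id
headers = params["MODEL_HEADERS"]                # {"Content-Type": "application/json"}
safe_check = params["SAFE_CHECK"]                # gate the Azure content-safety calls
threshold_tps = params["THRESHOLD_TPS"]          # minimum acceptable tokens/sec
```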
+ 1 - 1
recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py

@@ -18,7 +18,7 @@ from azure.core.exceptions import HttpResponseError
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
 
 
 # Predefined inputs
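The scripts import numpy for result statistics; one plausible summary step over per-request (latency, tokens) pairs like those in the sketch above, with THRESHOLD_TPS taken from parameters.json (the exact computation is an assumption, not the script's code):

```
# Illustrative throughput summary; numpy is imported by the scripts, but this
# exact computation is an assumption, not their actual code.
import numpy as np
from typing import Tuple, List

def summarize(results: List[Tuple[float, int]], threshold_tps: float) -> None:
    latencies = np.array([lat for lat, _ in results])
    tokens = np.array([tok for _, tok in results])
    tps = tokens / latencies  # per-request tokens/sec
    print(f"p50 latency: {np.percentile(latencies, 50):.2f}s")
    print(f"p99 latency: {np.percentile(latencies, 99):.2f}s")
    print(f"mean tokens/sec: {tps.mean():.1f}")
    print(f"requests below {threshold_tps} TPS: {(tps < threshold_tps).sum()}")
```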