2 anos atrás · 2828fd0201
--- a/recipes/benchmarks/inference_throughput/on-prem/README.md
+++ b/recipes/benchmarks/inference_throughput/on-prem/README.md
@@ -37,3 +37,5 @@ To run pretrained model benchmark, follow the command below.
 
				 ```
			
 
				 python pretrained_vllm_benchmark.py
			
 
				 ```
			
 
				+
			
 
				+For more details on vLLM benchmarks, refer to the official vLLM GitHub repo [here](https://github.com/vllm-project/vllm/tree/main/benchmarks).
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
@@ -4,7 +4,6 @@
 
				 import csv
			
 
				 import json
			
 
				 import time
			
 
				-import random
			
 
				 import threading
			
 
				 import numpy as np
			
 
				 import requests
			
@@ -18,7 +17,7 @@ from azure.core.exceptions import HttpResponseError
 
				 from azure.ai.contentsafety.models import AnalyzeTextOptions
			
 
				 
			
 
				 from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				-from typing import Dict, Tuple, List
			
 
				+from typing import Tuple, List
			
 
				 
			
 
				 
			
 
				 
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
@@ -1,7 +1,7 @@
 
				 {
			
 
				     "MAX_NEW_TOKENS" : 256,
			
 
				     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
			
 
				-    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
			
 
				+    "MODEL_PATH" : "meta-llama/your-model-path",
			
 
				     "MODEL_HEADERS" : {"Content-Type": "application/json"},
			
 
				     "SAFE_CHECK" : true,
			
 
				     "THRESHOLD_TPS" : 7,
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
@@ -18,7 +18,7 @@ from azure.core.exceptions import HttpResponseError
 
				 from azure.ai.contentsafety.models import AnalyzeTextOptions
			
 
				 
			
 
				 from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				-from typing import Dict, Tuple, List
			
 
				+from typing import Tuple, List
			
 
				 
			
 
				 
			
 
				 # Predefined inputs