
Update Azure API benchmark scripts

As part of the refactor, use the model path to retrieve the tokenizer from Hugging Face.
Chester Hu 10 months ago
parent
commit
3d914de2d6

+ 5 - 7
recipes/benchmarks/inference_throughput/cloud-api/azure/chat_azure_api_benchmark.py

@@ -10,6 +10,7 @@ import transformers
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
+# Add your own prompt in input.jsonl for testing.
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,8 +24,7 @@ MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
 CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
-# Default Llama 2 tokenizer, replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
+MODEL_PATH = params["MODEL_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Model endpoint provided with API provider 
@@ -32,14 +32,12 @@ MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
 API_KEY = params["API_KEY"]
 SYS_PROMPT = params["SYS_PROMPT"]
 
-
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from Hugging Face based on MODEL_PATH. Llama 3 uses a tiktoken-based tokenizer, which differs from Llama 2's.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")
 
-
 def generate_text() -> Tuple[int, int]:
 
     #Configure payload data sending to API endpoint
@@ -49,7 +47,7 @@ def generate_text() -> Tuple[int, int]:
             "max_tokens": MAX_NEW_TOKEN,
             "temperature": TEMPERATURE,
             "top_p" : TOP_P,
-            "stream": "False"
+            "stream": False
     }
     body = str.encode(json.dumps(payload))
     url = MODEL_ENDPOINTS
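
For reference, a minimal sketch of how this non-streaming payload might be sent to the chat completions endpoint. The "messages" structure, header names, and response shape below are assumptions based on an OpenAI-style chat API, not taken from this diff:

    # Sketch only: posting the chat payload with urllib; auth scheme and
    # response shape are assumptions.
    import json
    import urllib.request

    MODEL_ENDPOINTS = "https://your-endpoint.inference.ai.azure.com/v1/chat/completions"
    API_KEY = "your-auth-key"

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        "max_tokens": 256,
        "temperature": 0.6,
        "top_p": 0.9,
        "stream": False,  # boolean, matching the fix above; the string "False" would be truthy
    }
    body = str.encode(json.dumps(payload))
    req = urllib.request.Request(
        MODEL_ENDPOINTS,
        data=body,
        headers={"Content-Type": "application/json",
                 "Authorization": "Bearer " + API_KEY},  # auth header is an assumption
    )
    with urllib.request.urlopen(req) as response:
        result = json.loads(response.read())
    print(result["choices"][0]["message"]["content"])  # assumed response shape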

+ 3 - 3
recipes/benchmarks/inference_throughput/cloud-api/azure/parameters.json

@@ -2,11 +2,11 @@
     "MAX_NEW_TOKEN" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64],
     "THRESHOLD_TPS" : 7,
-    "TOKENIZER_PATH" : "../../tokenizer",
-    "RANDOM_PROMPT_LENGTH" : 1000,
+    "MODEL_PATH" : "meta-llama/your-model-path",
+    "RANDOM_PROMPT_LENGTH" : 25,
     "TEMPERATURE" : 0.6,
     "TOP_P" : 0.9,
-    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/completions",
+    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/chat/completions",
     "API_KEY" : "your-auth-key",
     "SYS_PROMPT" : "You are a helpful assistant."
 }
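
A minimal sketch of how the new MODEL_PATH parameter might be consumed, assuming the tokenizer is pulled from the Hugging Face Hub (a gated Llama repo would also require logging in to the Hub first; that step is not shown in this diff):

    # Sketch only: load parameters.json and fetch the tokenizer by model path.
    import json
    import transformers

    with open("parameters.json") as f:
        params = json.load(f)

    MODEL_PATH = params["MODEL_PATH"]  # e.g. a Hugging Face repo id such as "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
    print(len(tokenizer.encode("You are a helpful assistant.")))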

+ 4 - 4
recipes/benchmarks/inference_throughput/cloud-api/azure/pretrained_azure_api_benchmark.py

@@ -11,7 +11,7 @@ import transformers
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
-# Predefined inputs
+# Predefined inputs - optional
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,7 +23,7 @@ CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
 # Default Llama 2 tokenizer, replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -32,8 +32,8 @@ MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
 API_KEY = params["API_KEY"]
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from Hugging Face based on MODEL_PATH. Llama 3 uses a tiktoken-based tokenizer, which differs from Llama 2's.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
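
A minimal sketch of how this filtered vocab could be turned into a random prompt of roughly RANDOM_PROMPT_LENGTH tokens; the sampling and joining details are assumptions, not taken from this diff:

    # Sketch only: build a synthetic prompt from the filtered vocabulary.
    import random
    import transformers

    MODEL_PATH = "meta-llama/your-model-path"  # placeholder, as in parameters.json
    RANDOM_PROMPT_LENGTH = 25

    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
    vocab = [token for token in tokenizer.get_vocab().keys()
             if len(token) > 2 and all(ord(c) < 128 for c in token)]
    prompt = " ".join(random.choices(vocab, k=RANDOM_PROMPT_LENGTH))
    print(prompt)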