# Copyright (c) Meta Platforms, Inc. and affiliates.

# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import csv
import json
import time
import random
import urllib.error
import urllib.request

import numpy as np
import transformers

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
 
# Predefined inputs (loaded for completeness; this benchmark sends a randomly generated prompt instead)
with open('input.jsonl') as input_file:
    prompt_data = json.load(input_file)

with open('parameters.json') as parameters:
    params = json.load(parameters)
 
MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
# Threshold for tokens per second below which we deem the query to be slow
THRESHOLD_TPS = params["THRESHOLD_TPS"]
# Default Llama 2 tokenizer; replace with your own tokenizer if needed
TOKENIZER_PATH = params["TOKENIZER_PATH"]
RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
TEMPERATURE = params["TEMPERATURE"]
TOP_P = params["TOP_P"]
# Model endpoint provided by the API provider
MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
API_KEY = params["API_KEY"]
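
# For reference, parameters.json is assumed to look roughly like the following;
# the values below are illustrative placeholders, not recommendations:
# {
#     "MAX_NEW_TOKEN": 256,
#     "CONCURRENT_LEVELS": [1, 2, 4, 8, 16, 32],
#     "THRESHOLD_TPS": 7,
#     "TOKENIZER_PATH": "path/to/tokenizer",
#     "RANDOM_PROMPT_LENGTH": 1000,
#     "TEMPERATURE": 0.6,
#     "TOP_P": 0.9,
#     "MODEL_ENDPOINTS": "https://<your-deployment>.example.com/v1/completions",
#     "API_KEY": "<your-api-key>"
# }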
 
# This tokenizer is downloaded from the Azure model catalog for each specific model. Its main purpose is to decode responses for token counting.
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Select vocabulary entries longer than 2 characters (closer to real words) and made of ASCII characters only (a rough proxy for English; not foolproof)
vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
 
def generate_random_prompt(num_tokens):
    generated_tokens_count = 0
    selected_tokens = ""
    while generated_tokens_count < num_tokens:
        selected_tokens += random.choice(vocab)
        selected_tokens += " "
        generated_tokens_count = len(tokenizer.encode(selected_tokens))
    return selected_tokens
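
# Note: whole vocabulary entries are appended on each iteration, so the generated
# prompt can slightly overshoot num_tokens; the exact count is re-measured below.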
 
PROMPT = generate_random_prompt(RANDOM_PROMPT_LENGTH)
num_token_input_prompt = len(tokenizer.encode(PROMPT))
print(f"Number of tokens in the input prompt: {num_token_input_prompt}")
 
def generate_text() -> Tuple[float, int]:
    # Configure the payload data sent to the API endpoint
    payload = {
        "prompt": PROMPT,
        "max_tokens": MAX_NEW_TOKEN,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
    }
    body = json.dumps(payload).encode("utf-8")
    url = MODEL_ENDPOINTS
    api_key = API_KEY
    if not api_key:
        raise Exception("API Key is missing")

    headers = {'Content-Type': 'application/json', 'Authorization': api_key}
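    # Note: the key is sent as-is in the Authorization header; some providers expect
    # a "Bearer <key>" prefix instead, so adjust this to your endpoint's auth scheme.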
 
    req = urllib.request.Request(url, body, headers)
    token_count = 0
    output = ""
    start_time = time.time()
    # Send request
    try:
        response = urllib.request.urlopen(req)
        result = response.read()
        output = json.loads(result)["choices"][0]["text"]
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
        print(error.info())
        print(error.read().decode("utf8", 'ignore'))
    end_time = time.time()
    # Convert to ms
    latency = (end_time - start_time) * 1000
    token_count = len(tokenizer.encode(output))
    return latency, token_count
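
# Note: the latency measured above is the full non-streaming request round trip, so
# per-request tokens/sec folds in network and queueing time, not just decode time.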
 
def evaluate_performance(concurrent_requests: int) -> Tuple[float, float, float, float, float, float, int]:
    latencies = []
    total_output_tokens = 0
    output_tokens_per_second_each_request = []
    start_time = time.time()
    # Initialize multi-threaded execution
    with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        future_to_req = {executor.submit(generate_text): i for i in range(concurrent_requests)}
        for future in as_completed(future_to_req):
            latency, token_count = future.result()
            latencies.append(latency)
            total_output_tokens += token_count
            # Calculate tokens per second for this request
            tokens_per_sec = token_count / (latency / 1000)
            output_tokens_per_second_each_request.append(tokens_per_sec)
    end_time = time.time()
    total_time = end_time - start_time
    # RPS (requests per second)
    rps = concurrent_requests / total_time
    # Overall tokens per second
    output_tokens_per_second_overall = total_output_tokens / total_time
    input_tokens_per_second_overall = (num_token_input_prompt * concurrent_requests) / total_time
    p50_latency = np.percentile(latencies, 50)
    p99_latency = np.percentile(latencies, 99)
    # Count the number of requests below the tokens-per-second threshold
    below_threshold_count = sum(1 for tps in output_tokens_per_second_each_request if tps < THRESHOLD_TPS)
    output_tokens_per_second_per_request = sum(output_tokens_per_second_each_request) / len(output_tokens_per_second_each_request)
    return p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count
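
# Note: RPS and the overall tokens/sec figures are computed against the wall-clock
# time of the whole batch at each concurrency level, so they measure end-to-end
# throughput rather than the speed of any single request.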
 
# Print results as a markdown table
print("| Number of Concurrent Requests | P50 Latency (ms) | P99 Latency (ms) | RPS | Output Tokens per Second | Input Tokens per Second | Average Output Tokens per Second per Request | Number of Requests Below Threshold |")
print("|-------------------------------|------------------|------------------|-----|--------------------------|-------------------------|----------------------------------------------|------------------------------------|")

# Save results to a CSV file as well
csv_file = "performance_metrics.csv"
with open(csv_file, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Number of Concurrent Requests", "P50 Latency (ms)", "P99 Latency (ms)", "RPS", "Output Tokens per Second", "Input Tokens per Second", "Average Output Tokens per Second per Request", "Number of Requests Below Threshold"])
    for level in CONCURRENT_LEVELS:
        p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count = evaluate_performance(level)
        print(f"| {level} | {p50_latency:.2f} | {p99_latency:.2f} | {rps:.2f} | {output_tokens_per_second_overall:.2f} | {input_tokens_per_second_overall:.2f} | {output_tokens_per_second_per_request:.2f} | {below_threshold_count} |")
        writer.writerow([level, round(p50_latency, 2), round(p99_latency, 2), round(rps, 2), round(output_tokens_per_second_overall, 2), round(input_tokens_per_second_overall, 2), round(output_tokens_per_second_per_request, 2), below_threshold_count])
 
 