chat_vllm_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import csv
import json
import time
import threading
import numpy as np
import requests
import transformers
import torch

# Imports for Azure content safety
from azure.ai.contentsafety import ContentSafetyClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.ai.contentsafety.models import AnalyzeTextOptions

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple, List

with open('input.jsonl') as input:
    prompt_data = json.load(input)

# Prompt data is stored in a JSON file. Choose the input size by number of tokens: 5, 25, 50, 100, 500, 1k, or 2k.
# You can also configure and add your own prompt in input.jsonl.
PROMPT = prompt_data["1k"]
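# A minimal sketch of the expected file contents. Note that despite the .jsonl extension the file
# is read with json.load(), i.e. as a single JSON object keyed by prompt size; the prompt strings
# below are placeholders, not part of the original repository.
# {
#     "5": "<~5-token prompt>",
#     "25": "<~25-token prompt>",
#     ...
#     "1k": "<~1000-token prompt>",
#     "2k": "<~2000-token prompt>"
# }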

with open('parameters.json') as parameters:
    params = json.load(parameters)

MAX_NEW_TOKENS = params["MAX_NEW_TOKENS"]
CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
# Replace with your own deployment
MODEL_PATH = params["MODEL_PATH"]
MODEL_HEADERS = params["MODEL_HEADERS"]
SAFE_CHECK = params["SAFE_CHECK"]
# Threshold for tokens per second below which we deem the query to be slow
THRESHOLD_TPS = params["THRESHOLD_TPS"]
TEMPERATURE = params["TEMPERATURE"]
TOP_P = params["TOP_P"]
# Add your model endpoints here and specify the port number. You can obtain the endpoint when creating an on-prem server, e.g. with vLLM.
# Group of model endpoints - requests are balanced across the endpoints to maximize batching.
MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
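# An illustrative parameters.json covering the keys read above. All values are assumptions to
# adapt to your own deployment; the endpoint URL assumes a vLLM server exposing its
# OpenAI-compatible chat completions route on localhost:8000.
# {
#     "MAX_NEW_TOKENS": 200,
#     "CONCURRENT_LEVELS": [1, 2, 4, 8, 16, 32],
#     "MODEL_PATH": "meta-llama/Meta-Llama-3-8B-Instruct",
#     "MODEL_HEADERS": {"Content-Type": "application/json"},
#     "SAFE_CHECK": false,
#     "THRESHOLD_TPS": 7,
#     "TEMPERATURE": 0.6,
#     "TOP_P": 0.9,
#     "MODEL_ENDPOINTS": ["http://localhost:8000/v1/chat/completions"]
# }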

# Get number of GPUs on this instance
if torch.cuda.is_available():
    NUM_GPU = torch.cuda.device_count()
else:
    print("No available GPUs")
    # Fall back to 1 so the per-GPU metrics computed below still work on CPU-only hosts.
    NUM_GPU = 1

# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

num_token_input_prompt = len(tokenizer.encode(PROMPT))
print(f"Number of tokens in the input prompt: {num_token_input_prompt}")

# Azure content safety analysis
def analyze_prompt(input):
    start_time = time.time()

    # Obtain credentials
    key = ""  # Add your AZURE_CONTENT_SAFETY_KEY
    endpoint = ""  # Add your AZURE_CONTENT_SAFETY_ENDPOINT

    # Create a content safety client
    client = ContentSafetyClient(endpoint, AzureKeyCredential(key))

    # Create request
    request = AnalyzeTextOptions(text=input)

    # Analyze prompt
    try:
        response = client.analyze_text(request)
    except HttpResponseError as e:
        print("Prompt failed due to content safety filtering.")
        if e.error:
            print(f"Error code: {e.error.code}")
            print(f"Error message: {e.error.message}")
            raise
        print(e)
        raise

    analyze_end_time = time.time()
    # The round-trip latency for the Azure content safety check
    analyze_latency = (analyze_end_time - start_time) * 1000

# Simple round-robin to dispatch requests across the different endpoints/containers
executor_id = 0
lock = threading.Lock()

def generate_text() -> Tuple[float, int]:
    headers = MODEL_HEADERS
    payload = {
        "model": MODEL_PATH,
        "messages": [
            {
                "role": "user",
                "content": PROMPT
            }
        ],
        "stream": False,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "max_tokens": MAX_NEW_TOKENS
    }

    start_time = time.time()

    if SAFE_CHECK:
        # Send the prompt for a safety check. The request round-trip adds a delay that counts towards the overall throughput measurement.
        # Expect NO return value from this call. If you want to inspect the safety check results, print them inside the function itself.
        analyze_prompt(PROMPT)
        # Alternatively, simulate the delay if you don't want to use the Azure Content Safety check. The API round-trip is around 0.3-0.4 seconds
        # depending on where you are located. You can use something like: time.sleep(random.uniform(0.3, 0.4)) (requires `import random`).

    # Acquire lock to dispatch the request
    lock.acquire()
    global executor_id
    if executor_id != len(MODEL_ENDPOINTS) - 1:
        executor_id += 1
        endpoint_id = executor_id
    else:
        executor_id = 0
        endpoint_id = executor_id
    lock.release()

    # Send request
    response = requests.post(MODEL_ENDPOINTS[endpoint_id], headers=headers, json=payload)

    if SAFE_CHECK:
        # Run the safety check again after generation; as above, the round-trip delay counts towards the overall throughput measurement
        # and nothing is returned.
        analyze_prompt(PROMPT)
        # Alternatively, simulate the delay, e.g.: time.sleep(random.uniform(0.3, 0.4))

    end_time = time.time()
    # Convert to ms
    latency = (end_time - start_time) * 1000

    if response.status_code != 200:
        raise ValueError(f"Error: {response.content}")

    output = json.loads(response.content)["choices"][0]["message"]["content"]

    token_count = len(tokenizer.encode(output))
    return latency, token_count
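# For reference, a trimmed, illustrative response body from an OpenAI-compatible chat completions
# endpoint such as the one vLLM exposes; only the fields indexed above are relied upon by this
# script, and other fields may vary by server version.
# {
#     "choices": [
#         {"index": 0, "message": {"role": "assistant", "content": "<generated text>"}, "finish_reason": "stop"}
#     ],
#     "usage": {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...}
# }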

def evaluate_performance(concurrent_requests: int) -> Tuple[float, float, float, float, float, float, float, float, int]:
    latencies = []
    total_output_tokens = 0
    output_tokens_per_second_each_request = []
    start_time = time.time()

    # Init multi-threaded execution
    with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        future_to_req = {executor.submit(generate_text): i for i in range(concurrent_requests)}
        for future in as_completed(future_to_req):
            latency, token_count = future.result()
            latencies.append(latency)
            total_output_tokens += token_count
            # Calculate tokens per second for this request
            tokens_per_sec = token_count / (latency / 1000)
            output_tokens_per_second_each_request.append(tokens_per_sec)

    end_time = time.time()
    total_time = end_time - start_time
    # RPS (requests per second)
    rps = concurrent_requests / total_time
    # Overall tokens per second
    output_tokens_per_second_overall = total_output_tokens / total_time
    input_tokens_per_second_overall = (num_token_input_prompt * concurrent_requests) / total_time
    output_tokens_per_second_per_gpu = output_tokens_per_second_overall / NUM_GPU
    input_tokens_per_second_per_gpu = input_tokens_per_second_overall / NUM_GPU
    p50_latency = np.percentile(latencies, 50)
    p99_latency = np.percentile(latencies, 99)

    # Count the number of requests below the tokens-per-second threshold
    below_threshold_count = sum(1 for tps in output_tokens_per_second_each_request if tps < THRESHOLD_TPS)
    output_tokens_per_second_per_request = sum(output_tokens_per_second_each_request) / len(output_tokens_per_second_each_request)

    return p50_latency, p99_latency, rps, output_tokens_per_second_overall, output_tokens_per_second_per_gpu, input_tokens_per_second_overall, input_tokens_per_second_per_gpu, output_tokens_per_second_per_request, below_threshold_count

# Print markdown table header
print("| Number of Concurrent Requests | P50 Latency (ms) | P99 Latency (ms) | RPS | Output Tokens per Second | Output Tokens per Second per GPU | Input Tokens per Second | Input Tokens per Second per GPU | Average Output Tokens per Second per Request | Number of Requests Below Threshold |")
print("|-------------------------------|------------------|------------------|-----|--------------------------|----------------------------------|-------------------------|---------------------------------|----------------------------------------------|------------------------------------|")

# Save to file
csv_file = "performance_metrics.csv"
with open(csv_file, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Number of Concurrent Requests", "P50 Latency (ms)", "P99 Latency (ms)", "RPS", "Output Tokens per Second", "Output Tokens per Second per GPU", "Input Tokens per Second", "Input Tokens per Second per GPU", "Average Output Tokens per Second per Request"])

    for level in CONCURRENT_LEVELS:
        p50_latency, p99_latency, rps, output_tokens_per_second_overall, output_tokens_per_second_per_gpu, input_tokens_per_second_overall, input_tokens_per_second_per_gpu, output_tokens_per_second_per_request, below_threshold_count = evaluate_performance(level)
        print(f"| {level} | {p50_latency:.2f} | {p99_latency:.2f} | {rps:.2f} | {output_tokens_per_second_overall:.2f} | {output_tokens_per_second_per_gpu:.2f} | {input_tokens_per_second_overall:.2f} | {input_tokens_per_second_per_gpu:.2f} | {output_tokens_per_second_per_request:.2f} | {below_threshold_count} |")
        writer.writerow([level, round(p50_latency, 2), round(p99_latency, 2), round(rps, 2), round(output_tokens_per_second_overall, 2), round(output_tokens_per_second_per_gpu, 2), round(input_tokens_per_second_overall, 2), round(input_tokens_per_second_per_gpu, 2), round(output_tokens_per_second_per_request, 2)])