@@ -10,6 +10,7 @@ import transformers
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
+# Add your own prompt in input.jsonl for testing.
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
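A note on the new comment above: the script reads the file with json.load, so despite the .jsonl extension it must be a single JSON document rather than one JSON object per line. The sketch below is only an illustration; the key name is an assumption, since the schema is not shown in this hunk, and it must match whatever key the script uses to pick PROMPT out of prompt_data.

import json

# Sketch only: write a minimal input.jsonl as one JSON document.
# The key "25" is an assumption and must match the key the script
# uses to select PROMPT from prompt_data.
with open("input.jsonl", "w") as f:
    json.dump({"25": "Write a short paragraph about the history of the printing press."}, f)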
@@ -23,8 +24,7 @@ MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
 CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama 2 tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Model endpoint provided with API provider
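Because TOKENIZER_PATH is replaced by MODEL_PATH, parameters.json now needs the new key. The sketch below (written as Python for illustration) lists the keys this script reads; every value is a placeholder assumption, not a repository default, and MODEL_PATH is now a Hugging Face repo id rather than a local tokenizer path.

import json

# Sketch of the keys read from parameters.json after this change.
# All values are placeholder assumptions.
params = {
    "MAX_NEW_TOKEN": 256,
    "CONCURRENT_LEVELS": [1, 2, 4, 8],
    "THRESHOLD_TPS": 7,
    "MODEL_PATH": "meta-llama/Meta-Llama-3-8B-Instruct",  # assumed HF repo id
    "TEMPERATURE": 0.6,
    "TOP_P": 0.9,
    "MODEL_ENDPOINTS": "https://<your-model-endpoint>",  # full chat-completions URL from your API provider
    "API_KEY": "<your-api-key>",
    "SYS_PROMPT": "You are a helpful assistant.",
}
with open("parameters.json", "w") as f:
    json.dump(params, f, indent=2)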
@@ -32,14 +32,12 @@ MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
 API_KEY = params["API_KEY"]
 SYS_PROMPT = params["SYS_PROMPT"]
 
-
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from huggingface based on MODEL_PATH. Llama 3 uses a tiktoken-based tokenizer, which is different from Llama 2's.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")
 
-
 def generate_text() -> Tuple[int, int]:
 
     #Configure payload data sending to API endpoint
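The comment on the new tokenizer line is why this swap matters for the benchmark: input and output token counts, and therefore the tokens-per-second numbers, change when text is recounted with the Llama 3 tokenizer instead of the Llama 2 one. A standalone sketch for checking the difference is below; the model ids are assumptions, and both repos are gated on Hugging Face, so an authenticated access token is required.

import transformers

# Sketch: compare token counts from the Llama 2 and Llama 3 tokenizers.
# Model ids are assumptions; both repos are gated and need HF access approval.
text = "Briefly explain what tokens per second measures in an LLM benchmark."
for model_id in ("meta-llama/Llama-2-7b-chat-hf", "meta-llama/Meta-Llama-3-8B-Instruct"):
    tok = transformers.AutoTokenizer.from_pretrained(model_id)
    print(f"{model_id}: {len(tok.encode(text))} tokens")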
@@ -49,7 +47,7 @@ def generate_text() -> Tuple[int, int]:
         "max_tokens": MAX_NEW_TOKEN,
         "temperature": TEMPERATURE,
         "top_p" : TOP_P,
-        "stream": "False"
+        "stream": False
     }
     body = str.encode(json.dumps(payload))
     url = MODEL_ENDPOINTS
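The last hunk fixes a type bug: the string "False" is not the JSON boolean false. After json.dumps, the old payload sent "stream": "False", which an endpoint checking truthiness could read as streaming enabled; the new payload serializes to the JSON literal false. A quick check:

import json

# The string "False" stays a JSON string, while the Python boolean False
# becomes the JSON literal false that the API expects.
print(json.dumps({"stream": "False"}))  # -> {"stream": "False"}
print(json.dumps({"stream": False}))    # -> {"stream": false}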