@@ -9,7 +9,7 @@ aws:
# SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
sagemaker_execution_role: {role_arn}
# S3 bucket to which metrics, plots and reports would be written to
- bucket: {write_bucket} ## add the name of your desired bucket
+ bucket: {write_bucket}
# directory paths in the write bucket, no need to change these
dir_paths:
@@ -22,9 +22,10 @@ dir_paths:
# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
- # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-{region}-{account_id}
+ # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
read_bucket: {read_bucket}
-
+ scripts_prefix: scripts
+
# S3 prefix in the read bucket where deployment and inference scripts should be placed
scripts_prefix: scripts
@@ -52,13 +53,12 @@ s3_read_data:
- narrativeqa.jsonl
- triviaqa_e.jsonl
- triviaqa.jsonl
-
# S3 prefix for the tokenizer to be used with the models
# NOTE 1: the same tokenizer is used with all the models being tested through a config file
# NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
- # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
+ # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer and so on and so forth.
tokenizer_prefix: tokenizer
-
+
# S3 prefix for prompt templates
prompt_template_dir: prompt_template
@@ -79,7 +79,7 @@ run_steps:
4_model_metric_analysis.ipynb: yes
5_cleanup.ipynb: yes
-# dataset related configuration
+
datasets:
# Refer to the 1_generate_data.ipynb notebook
# the dataset you use is expected to have the
@@ -89,7 +89,7 @@ datasets:
prompt_template_keys:
- input
- context
-
+
# if your dataset has multiple languages and it has a language
# field then you could filter it for a language. Similarly,
# you can filter your dataset to only keep prompts between
@@ -125,7 +125,7 @@ datasets:
# dataset which is listed below as the dataset_of_interest
metrics:
dataset_of_interest: en_2000-3000
-
+
# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
@@ -156,18 +156,18 @@ experiments:
# model_id is interpreted in conjunction with the deployment_script, so if you
# use a JumpStart model id then set the deployment_script to jumpstart.py.
# if deploying directly from HuggingFace this would be a HuggingFace model id
- # see the DJL serving deployment script in the code repo for reference.
+ # see the DJL serving deployment script in the code repo for reference.
model_id: meta-textgeneration-llama-2-7b-f
model_version: "3.*"
model_name: llama2-7b-f
ep_name: llama-2-7b-g5xlarge
instance_type: "ml.g5.xlarge"
image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
- deploy: yes
+ deploy: yes
instance_count: 1
# FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
# scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
- # See repo for details
+ # See repo for details
deployment_script: jumpstart.py
# FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
# and Bedrock. You can also add your own. See repo for details
@@ -181,6 +181,7 @@ experiments:
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
+ #- payload_en_3000-3840.jsonl
# concurrency level refers to number of requests sent in parallel to an endpoint
# the next set of requests is sent once responses for all concurrent requests have
# been received.
@@ -188,7 +189,7 @@ experiments:
- 1
- 2
- 4
- # Added for models that require accepting a EULA
+
accept_eula: true
# Environment variables to be passed to the container
# this is not a fixed list, you can add more parameters as applicable.
@@ -204,6 +205,10 @@ experiments:
SAGEMAKER_MODEL_SERVER_WORKERS: "1"
- name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
+ # model_id is interpreted in conjunction with the deployment_script, so if you
+ # use a JumpStart model id then set the deployment_script to jumpstart.py.
+ # if deploying directly from HuggingFace this would be a HuggingFace model id
+ # see the DJL serving deployment script in the code repo for reference.
model_id: meta-textgeneration-llama-2-7b-f
model_version: "3.*"
model_name: llama2-7b-f
@@ -211,23 +216,36 @@ experiments:
instance_type: "ml.g5.2xlarge"
image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
deploy: yes
+ # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
+ # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
+ # See repo for details
instance_count: 1
deployment_script: jumpstart.py
+ # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
+ # and Bedrock. You can also add your own. See repo for details
inference_script: sagemaker_predictor.py
inference_spec:
+ # this should match one of the sections in the inference_parameters section above
parameter_set: sagemaker
+ # runs are done for each combination of payload file and concurrency level
payload_files:
- payload_en_1-500.jsonl
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
-
+ #- payload_en_3000-3840.jsonl
+
+ # concurrency level refers to number of requests sent in parallel to an endpoint
+ # the next set of requests is sent once responses for all concurrent requests have
+ # been received.
concurrency_levels:
- 1
- 2
- 4
-
+ # Added for models that require accepting a EULA
accept_eula: true
+ # Environment variables to be passed to the container
+ # this is not a fixed list, you can add more parameters as applicable.
env:
SAGEMAKER_PROGRAM: "inference.py"
ENDPOINT_SERVER_TIMEOUT: "3600"
@@ -249,7 +267,6 @@ report:
latency_budget: 2
cost_per_10k_txn_budget: 20
error_rate_budget: 0
-
# other misc reporting parameters, see 4_model_metric_analysis.ipynb
# for more information
per_inference_request_file: per_inference_request_results.csv
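
The diff above does not preserve YAML indentation, so for orientation here is a minimal sketch of how one `experiments` entry nests once re-indented. Keys and values are copied verbatim from the second entry in the diff; the two-space indentation, and the placement of `payload_files`, `concurrency_levels`, `accept_eula` and `env` as siblings of `inference_spec`, are assumptions rather than something the diff itself shows.

```yaml
# Illustrative sketch only: indentation and nesting are assumed,
# keys and values are taken from the g5.2xlarge entry in the diff above.
experiments:
  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    instance_type: "ml.g5.2xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py          # FMBench also ships DJL DeepSpeed, tensorRT etc. scripts
    inference_script: sagemaker_predictor.py # SageMaker and Bedrock inference scripts are included
    inference_spec:
      parameter_set: sagemaker               # should match a section under inference_parameters
    # one run is done per combination of payload file and concurrency level
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
    concurrency_levels:
      - 1
      - 2
      - 4
    accept_eula: true                        # for models that require accepting a EULA
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
```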