@@ -9,7 +9,7 @@ aws:
# SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
sagemaker_execution_role: {role_arn}
# S3 bucket to which metrics, plots and reports would be written to
- bucket: {write_bucket} ## add the name of your desired bucket
+ bucket: {write_bucket}
# directory paths in the write bucket, no need to change these
dir_paths:
@@ -22,9 +22,10 @@ dir_paths:
# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
- # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-{region}-{account_id}
+ # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
read_bucket: {read_bucket}
-
+ scripts_prefix: scripts
+
# S3 prefix in the read bucket where deployment and inference scripts should be placed
scripts_prefix: scripts
@@ -52,13 +53,12 @@ s3_read_data:
- narrativeqa.jsonl
- triviaqa_e.jsonl
- triviaqa.jsonl
-
# S3 prefix for the tokenizer to be used with the models
# NOTE 1: the same tokenizer is used with all the models being tested through a config file
# NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
- # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
+ # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer and so on and so forth.
tokenizer_prefix: tokenizer
-
+
# S3 prefix for prompt templates
prompt_template_dir: prompt_template
@@ -79,7 +79,7 @@ run_steps:
4_model_metric_analysis.ipynb: yes
5_cleanup.ipynb: yes
-# dataset related configuration
+
datasets:
# Refer to the 1_generate_data.ipynb notebook
# the dataset you use is expected to have the
@@ -89,7 +89,7 @@ datasets:
prompt_template_keys:
- input
- context
-
+
# if your dataset has multiple languages and it has a language
# field then you could filter it for a language. Similarly,
# you can filter your dataset to only keep prompts between
@@ -125,7 +125,7 @@ datasets:
# dataset which is listed below as the dataset_of_interest
metrics:
dataset_of_interest: en_2000-3000
-
+
# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
@@ -156,18 +156,18 @@ experiments:
# model_id is interpreted in conjunction with the deployment_script, so if you
# use a JumpStart model id then set the deployment_script to jumpstart.py.
# if deploying directly from HuggingFace this would be a HuggingFace model id
- # see the DJL serving deployment script in the code repo for reference.
+ # see the DJL serving deployment script in the code repo for reference.
model_id: meta-textgeneration-llama-2-7b-f
model_version: "3.*"
model_name: llama2-7b-f
ep_name: llama-2-7b-g5xlarge
instance_type: "ml.g5.xlarge"
image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
- deploy: yes
+ deploy: yes
instance_count: 1
# FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
# scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
- # See repo for details
+ # See repo for details
deployment_script: jumpstart.py
# FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
# and Bedrock. You can also add your own. See repo for details
@@ -181,6 +181,7 @@ experiments:
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
+ #- payload_en_3000-3840.jsonl
# concurrency level refers to number of requests sent in parallel to an endpoint
# the next set of requests is sent once responses for all concurrent requests have
# been received.
@@ -188,7 +189,7 @@ experiments:
- 1
- 2
- 4
- # Added for models that require accepting a EULA
+
accept_eula: true
# Environment variables to be passed to the container
# this is not a fixed list, you can add more parameters as applicable.
@@ -204,6 +205,10 @@ experiments:
SAGEMAKER_MODEL_SERVER_WORKERS: "1"
- name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
+ # model_id is interpreted in conjunction with the deployment_script, so if you
+ # use a JumpStart model id then set the deployment_script to jumpstart.py.
+ # if deploying directly from HuggingFace this would be a HuggingFace model id
+ # see the DJL serving deployment script in the code repo for reference.
model_id: meta-textgeneration-llama-2-7b-f
model_version: "3.*"
model_name: llama2-7b-f
@@ -211,23 +216,36 @@ experiments:
instance_type: "ml.g5.2xlarge"
image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
deploy: yes
+ # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
+ # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
+ # See repo for details
instance_count: 1
deployment_script: jumpstart.py
+ # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
+ # and Bedrock. You can also add your own. See repo for details
inference_script: sagemaker_predictor.py
inference_spec:
+ # this should match one of the sections in the inference_parameters section above
parameter_set: sagemaker
+ # runs are done for each combination of payload file and concurrency level
payload_files:
- payload_en_1-500.jsonl
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
-
+ #- payload_en_3000-3840.jsonl
+
+ # concurrency level refers to number of requests sent in parallel to an endpoint
+ # the next set of requests is sent once responses for all concurrent requests have
+ # been received.
concurrency_levels:
- 1
- 2
- 4
-
+ # Added for models that require accepting a EULA
accept_eula: true
+ # Environment variables to be passed to the container
+ # this is not a fixed list, you can add more parameters as applicable.
env:
SAGEMAKER_PROGRAM: "inference.py"
ENDPOINT_SERVER_TIMEOUT: "3600"
@@ -249,7 +267,6 @@ report:
latency_budget: 2
cost_per_10k_txn_budget: 20
error_rate_budget: 0
-
# other misc reporting parameters, see 4_model_metric_analysis.ipynb
# for more information
per_inference_request_file: per_inference_request_results.csv
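
The diff above does not preserve YAML indentation, so for orientation here is a minimal sketch of how one `experiments` entry nests once re-indented. Keys and values are copied verbatim from the second entry in the diff; the two-space indentation, and the placement of `payload_files`, `concurrency_levels`, `accept_eula` and `env` as siblings of `inference_spec`, are assumptions rather than something the diff itself shows.

```yaml
# Illustrative sketch only: indentation and nesting are assumed,
# keys and values are taken from the g5.2xlarge entry in the diff above.
experiments:
  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    instance_type: "ml.g5.2xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py          # FMBench also ships DJL DeepSpeed, tensorRT etc. scripts
    inference_script: sagemaker_predictor.py # SageMaker and Bedrock inference scripts are included
    inference_spec:
      parameter_set: sagemaker               # should match a section under inference_parameters
    # one run is done per combination of payload file and concurrency level
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
    concurrency_levels:
      - 1
      - 2
      - 4
    accept_eula: true                        # for models that require accepting a EULA
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
```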