general:
  name: "llama2-7b-v1"
  model_name: "Llama2-7b"

# AWS and SageMaker settings
aws:
  # AWS region, this parameter is templatized, no need to change
  region: {region}
  # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
  sagemaker_execution_role: {role_arn}
  # S3 bucket to which metrics, plots and reports are written
  bucket: {write_bucket}

# directory paths in the write bucket, no need to change these
dir_paths:
  data_prefix: data
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
  read_bucket: {read_bucket}
  # S3 prefix in the read bucket where deployment and inference scripts should be placed
  scripts_prefix: scripts
  # deployment and inference script files to be downloaded are placed in this list
  # only needed if you are creating a new deployment script or inference script
  # your Hugging Face token does need to be in this list, in a file called "hf_token.txt"
  script_files:
  - hf_token.txt
  # configuration files (like this one) are placed in this prefix
  configs_prefix: configs
  # list of configuration files to download, for now only pricing.yml needs to be downloaded
  config_files:
  - pricing.yml
  # S3 prefix for the dataset files
  source_data_prefix: source_data
  # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
  source_data_files:
  - 2wikimqa_e.jsonl
  - 2wikimqa.jsonl
  - hotpotqa_e.jsonl
  - hotpotqa.jsonl
  - narrativeqa.jsonl
  - triviaqa_e.jsonl
  - triviaqa.jsonl
  # S3 prefix for the tokenizer to be used with the models
  # NOTE 1: the same tokenizer is used with all the models being tested through a config file
  # NOTE 2: place your model-specific tokenizers in a prefix named <model_name>_tokenizer,
  #         so the Mistral tokenizer goes in mistral_tokenizer, the Llama2 tokenizer in llama2_tokenizer, and so on
  tokenizer_prefix: tokenizer
  # S3 prefix for prompt templates
  prompt_template_dir: prompt_template
  # prompt template to use, NOTE: the same prompt template gets used for all models being tested through a config file
  # the FMBench repo already contains a number of prompt templates, so review those first before creating a new one
  prompt_template_file: prompt_template_llama2.txt
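# For reference, a read bucket laid out per the prefixes above would look roughly
# like the sketch below (the contents shown are illustrative, not an exhaustive or
# authoritative listing):
#
#   s3://<read_bucket>/
#   ├── scripts/              # scripts_prefix: hf_token.txt plus any custom deployment/inference scripts
#   ├── configs/              # configs_prefix: pricing.yml and config files such as this one
#   ├── source_data/          # source_data_prefix: the LongBench .jsonl files listed above
#   ├── prompt_template/      # prompt_template_dir: prompt_template_llama2.txt
#   └── tokenizer/            # tokenizer_prefix: the tokenizer files for the model being tested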
# steps to run, usually all of these would be
# set to yes so nothing needs to change here
# you could, however, bypass some steps, for example
# set 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes

datasets:
  # Refer to the 1_generate_data.ipynb notebook
  # the dataset you use is expected to have the
  # columns you put in the prompt_template_keys list
  # and your prompt template also needs to have
  # the same placeholders (refer to the prompt template folder)
  prompt_template_keys:
  - input
  - context

  # if your dataset has multiple languages and it has a language
  # field then you can filter it for a language. Similarly,
  # you can filter your dataset to only keep prompts within
  # a certain token length range (the token length is determined
  # using the tokenizer you provide via the tokenizer_prefix in the
  # read S3 bucket). Each of the array entries below creates a payload file
  # containing prompts matching the language and token length criteria.
  filters:
  - language: en
    min_length_in_tokens: 1
    max_length_in_tokens: 500
    payload_file: payload_en_1-500.jsonl
  - language: en
    min_length_in_tokens: 500
    max_length_in_tokens: 1000
    payload_file: payload_en_500-1000.jsonl
  - language: en
    min_length_in_tokens: 1000
    max_length_in_tokens: 2000
    payload_file: payload_en_1000-2000.jsonl
  - language: en
    min_length_in_tokens: 2000
    max_length_in_tokens: 3000
    payload_file: payload_en_2000-3000.jsonl
  - language: en
    min_length_in_tokens: 3000
    max_length_in_tokens: 3840
    payload_file: payload_en_3000-3840.jsonl

# while the tests run on all the datasets configured
# in the experiment entries below, the price:performance
# analysis is only done for the one dataset listed
# below as the dataset_of_interest
metrics:
  dataset_of_interest: en_2000-3000

# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml

# inference parameters, these are added to the payload
# for each inference request. The list here is not static,
# any parameter supported by the inference container can be
# added to the list. Put the SageMaker parameters in the sagemaker
# section and the Bedrock parameters in the bedrock section (not shown here).
# Use the section name (sagemaker in this example) in the inference_spec.parameter_set
# field under experiments.
inference_parameters:
  sagemaker:
    do_sample: yes
    temperature: 0.1
    top_p: 0.92
    top_k: 120
    max_new_tokens: 100
    return_full_text: False
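# The bedrock section referred to above is not used in this config. A purely
# illustrative, commented-out sketch of what such a section could look like is
# shown below; the exact parameter names supported depend on the Bedrock model
# and the inference script you use, so treat these as placeholders rather than
# a definitive list.
#  bedrock:
#    temperature: 0.1
#    top_p: 0.92
#    max_tokens: 100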
# Configuration for experiments to be run. The experiments section is an array
# so more than one experiment can be added; these could belong to the same model
# but different instance types, to different models, or even to different hosting
# options (such as one experiment on SageMaker and another on Bedrock).
experiments:
  - name: llama2-7b-g5.xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    # model_id is interpreted in conjunction with the deployment_script, so if you
    # use a JumpStart model id then set the deployment_script to jumpstart.py.
    # if deploying directly from HuggingFace this would be a HuggingFace model id,
    # see the DJL serving deployment script in the code repo for reference.
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    ep_name: llama-2-7b-g5xlarge
    instance_type: "ml.g5.xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
    # and scripts for deploying using DJL DeepSpeed, TensorRT etc. You can also add your own.
    # See the repo for details
    deployment_script: jumpstart.py
    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
    # and Bedrock. You can also add your own. See the repo for details
    inference_script: sagemaker_predictor.py
    inference_spec:
      # this should match one of the sections in the inference_parameters section above
      parameter_set: sagemaker
    # runs are done for each combination of payload file and concurrency level
    payload_files:
    - payload_en_1-500.jsonl
    - payload_en_500-1000.jsonl
    - payload_en_1000-2000.jsonl
    - payload_en_2000-3000.jsonl
    #- payload_en_3000-3840.jsonl
    # concurrency level refers to the number of requests sent in parallel to an endpoint,
    # the next set of requests is sent once responses for all concurrent requests have
    # been received.
    concurrency_levels:
    - 1
    - 2
    - 4
    accept_eula: true
    # environment variables to be passed to the container,
    # this is not a fixed list, you can add more parameters as applicable.
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    # model_id is interpreted in conjunction with the deployment_script, so if you
    # use a JumpStart model id then set the deployment_script to jumpstart.py.
    # if deploying directly from HuggingFace this would be a HuggingFace model id,
    # see the DJL serving deployment script in the code repo for reference.
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    ep_name: llama-2-7b-g5-2xlarge
    instance_type: "ml.g5.2xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
    # and scripts for deploying using DJL DeepSpeed, TensorRT etc. You can also add your own.
    # See the repo for details
    deployment_script: jumpstart.py
    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
    # and Bedrock. You can also add your own. See the repo for details
    inference_script: sagemaker_predictor.py
    inference_spec:
      # this should match one of the sections in the inference_parameters section above
      parameter_set: sagemaker
    # runs are done for each combination of payload file and concurrency level
    payload_files:
    - payload_en_1-500.jsonl
    - payload_en_500-1000.jsonl
    - payload_en_1000-2000.jsonl
    - payload_en_2000-3000.jsonl
    #- payload_en_3000-3840.jsonl

    # concurrency level refers to the number of requests sent in parallel to an endpoint,
    # the next set of requests is sent once responses for all concurrent requests have
    # been received.
    concurrency_levels:
    - 1
    - 2
    - 4
    # added for models that require accepting an EULA
    accept_eula: true
    # environment variables to be passed to the container,
    # this is not a fixed list, you can add more parameters as applicable.
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
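# Note on run volume: runs are the cross product of payload files and concurrency
# levels, so with the four (uncommented) payload files and three concurrency levels
# configured above, each experiment runs 4 x 3 = 12 payload/concurrency combinations
# against its endpoint.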
# parameters related to how the final report is generated
report:
  # constraints for latency, cost and error rate
  # an experiment is considered successful or eligible for
  # selection for a use-case if it satisfies all of the following
  # constraints. Experiments are scored against these criteria,
  # a higher score is better (see the score_run function in 4_model_metric_analysis.ipynb)
  latency_budget: 2
  cost_per_10k_txn_budget: 20
  error_rate_budget: 0
  # other misc reporting parameters, see 4_model_metric_analysis.ipynb
  # for more information
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025
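# To make the constraints above concrete: with these values an experiment is
# eligible for a use-case only if its latency stays within latency_budget
# (2, assumed here to be in seconds; verify the unit against the score_run
# function in 4_model_metric_analysis.ipynb), the cost of serving
# txn_count_for_showing_cost (10,000) transactions stays within
# cost_per_10k_txn_budget (20, in the currency used by pricing.yml), and its
# error rate does not exceed error_rate_budget (0, i.e. no failed requests).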