general:
  name: "llama2-7b-v1"
  model_name: "Llama2-7b"

# AWS and SageMaker settings
aws:
  # AWS region, this parameter is templatized, no need to change
  region: {region}
  # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
  sagemaker_execution_role: {role_arn}
  # S3 bucket to which metrics, plots and reports are written
  bucket: {write_bucket}
# directory paths in the write bucket, no need to change these
dir_paths:
  data_prefix: data
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
  read_bucket: {read_bucket}
  # S3 prefix in the read bucket where deployment and inference scripts should be placed
  scripts_prefix: scripts
  # deployment and inference script files to be downloaded are placed in this list
  # only needed if you are creating a new deployment script or inference script
  # your HuggingFace token does need to be in this list and should be called "hf_token.txt"
  script_files:
    - hf_token.txt

  # configuration files (like this one) are placed in this prefix
  configs_prefix: configs
  # list of configuration files to download, for now only pricing.yml needs to be downloaded
  config_files:
    - pricing.yml

  # S3 prefix for the dataset files
  source_data_prefix: source_data
  # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
  source_data_files:
    - 2wikimqa_e.jsonl
    - 2wikimqa.jsonl
    - hotpotqa_e.jsonl
    - hotpotqa.jsonl
    - narrativeqa.jsonl
    - triviaqa_e.jsonl
    - triviaqa.jsonl
  # S3 prefix for the tokenizer to be used with the models
  # NOTE 1: the same tokenizer is used with all the models being tested through a config file
  # NOTE 2: place your model specific tokenizers in a prefix named <model_name>_tokenizer,
  #         so the mistral tokenizer goes in mistral_tokenizer, the Llama2 tokenizer in llama2_tokenizer and so on
  tokenizer_prefix: tokenizer

  # S3 prefix for prompt templates
  prompt_template_dir: prompt_template
  # prompt template to use, NOTE: the same prompt template gets used for all models being tested through a config file
  # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one
  prompt_template_file: prompt_template_llama2.txt
# steps to run, usually all of these would be
# set to yes so nothing needs to change here
# you could, however, bypass some steps, for example
# set 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes
datasets:
  # Refer to the 1_generate_data.ipynb notebook
  # the dataset you use is expected to have the
  # columns you put in the prompt_template_keys list
  # and your prompt template also needs to have
  # the same placeholders (refer to the prompt template folder;
  # an illustrative sketch follows the list below)
  prompt_template_keys:
    - input
    - context
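  # Illustrative sketch only (not part of the original config, and not the actual
  # prompt_template_llama2.txt shipped with FMBench): a template matching the keys
  # above would carry the same placeholders, along the lines of:
  #   <s>[INST] Answer the question using only the provided context.
  #   context: {context}
  #   question: {input} [/INST]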
  # if your dataset has multiple languages and it has a language
  # field then you could filter it for a language. Similarly,
  # you can filter your dataset to only keep prompts between
  # a certain token length limit (the token length is determined
  # using the tokenizer you provide in the tokenizer_prefix prefix in the
  # read S3 bucket). Each of the array entries below creates a payload file
  # containing prompts matching the language and token length criteria.
  filters:
    - language: en
      min_length_in_tokens: 1
      max_length_in_tokens: 500
      payload_file: payload_en_1-500.jsonl
    - language: en
      min_length_in_tokens: 500
      max_length_in_tokens: 1000
      payload_file: payload_en_500-1000.jsonl
    - language: en
      min_length_in_tokens: 1000
      max_length_in_tokens: 2000
      payload_file: payload_en_1000-2000.jsonl
    - language: en
      min_length_in_tokens: 2000
      max_length_in_tokens: 3000
      payload_file: payload_en_2000-3000.jsonl
    - language: en
      min_length_in_tokens: 3000
      max_length_in_tokens: 3840
      payload_file: payload_en_3000-3840.jsonl
# While the tests run on all the datasets configured
# in the experiment entries below, the price:performance
# analysis is only done for one dataset, which is listed
# below as the dataset_of_interest
metrics:
  dataset_of_interest: en_2000-3000
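  # the value follows the <language>_<min_tokens>-<max_tokens> naming used by the
  # filters above, so en_2000-3000 corresponds to the prompts in payload_en_2000-3000.jsonl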
# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml
# inference parameters, these are added to the payload
# for each inference request. The list here is not static;
# any parameter supported by the inference container can be
# added to the list. Put the sagemaker parameters in the sagemaker
# section, bedrock parameters in the bedrock section (not shown here;
# an illustrative sketch appears after the sagemaker section below).
# Use the section name (sagemaker in this example) in the inference_spec.parameter_set
# section under experiments.
inference_parameters:
  sagemaker:
    do_sample: yes
    temperature: 0.1
    top_p: 0.92
    top_k: 120
    max_new_tokens: 100
    return_full_text: False
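  # Illustrative sketch only (not part of the original config): a second parameter
  # set could be added here and referenced from inference_spec.parameter_set, e.g.
  # for a Llama 2 model hosted on Bedrock; the parameter names below are the Bedrock
  # Llama text-generation parameters and are shown purely as an example, check the
  # repo's Bedrock config files for the exact names FMBench expects.
  # bedrock:
  #   temperature: 0.1
  #   top_p: 0.92
  #   max_gen_len: 100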
# Configuration for experiments to be run. The experiments section is an array
# so more than one experiment can be added, these could belong to the same model
# but different instance types, or different models, or even different hosting
# options (such as one experiment on SageMaker and the other on Bedrock).
experiments:
  - name: llama2-7b-g5.xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    # model_id is interpreted in conjunction with the deployment_script, so if you
    # use a JumpStart model id then set the deployment_script to jumpstart.py.
    # if deploying directly from HuggingFace this would be a HuggingFace model id
    # (see the DJL serving deployment script in the code repo for reference,
    # and the illustrative note below).
    model_id: meta-textgeneration-llama-2-7b-f
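    # Illustrative only (not part of the original config): a HuggingFace deployment
    # would instead use a HuggingFace model id such as meta-llama/Llama-2-7b-chat-hf
    # together with one of the DJL serving deployment scripts from the repo.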
    model_version: "3.*"
    model_name: llama2-7b-f
    ep_name: llama-2-7b-g5xlarge
    instance_type: "ml.g5.xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
    # and scripts for deploying using DJL DeepSpeed, TensorRT etc. You can also add your own.
    # See repo for details
    deployment_script: jumpstart.py
    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
    # and Bedrock. You can also add your own. See repo for details
    inference_script: sagemaker_predictor.py
    inference_spec:
      # this should match one of the sections in the inference_parameters section above
      parameter_set: sagemaker
    # runs are done for each combination of payload file and concurrency level
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
      #- payload_en_3000-3840.jsonl
    # concurrency level refers to the number of requests sent in parallel to an endpoint;
    # the next set of requests is sent once responses for all concurrent requests have
    # been received.
    concurrency_levels:
      - 1
      - 2
      - 4
    accept_eula: true
    # Environment variables to be passed to the container
    # this is not a fixed list, you can add more parameters as applicable.
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
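      # MAX_INPUT_LENGTH/MAX_TOTAL_TOKENS reflect Llama 2's 4096-token context window,
      # and SM_NUM_GPUS is "1" because ml.g5.xlarge has a single GPU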
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    # model_id is interpreted in conjunction with the deployment_script, so if you
    # use a JumpStart model id then set the deployment_script to jumpstart.py.
    # if deploying directly from HuggingFace this would be a HuggingFace model id
    # see the DJL serving deployment script in the code repo for reference.
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    ep_name: llama-2-7b-g5-2xlarge
    instance_type: "ml.g5.2xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
    # and scripts for deploying using DJL DeepSpeed, TensorRT etc. You can also add your own.
    # See repo for details
    deployment_script: jumpstart.py
    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
    # and Bedrock. You can also add your own. See repo for details
    inference_script: sagemaker_predictor.py
    inference_spec:
      # this should match one of the sections in the inference_parameters section above
      parameter_set: sagemaker
    # runs are done for each combination of payload file and concurrency level
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
      #- payload_en_3000-3840.jsonl
    # concurrency level refers to the number of requests sent in parallel to an endpoint;
    # the next set of requests is sent once responses for all concurrent requests have
    # been received.
    concurrency_levels:
      - 1
      - 2
      - 4
    # Added for models that require accepting a EULA
    accept_eula: true
    # Environment variables to be passed to the container
    # this is not a fixed list, you can add more parameters as applicable.
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
# parameters related to how the final report is generated
report:
  # constraints for latency, cost and error rate
  # an experiment is considered successful or eligible for
  # selection for a use-case if it satisfies all of the following
  # constraints. Experiments are scored against these criteria,
  # higher score is better (see the score_run function in 4_model_metric_analysis.ipynb);
  # an illustrative example follows the three budget values below
  latency_budget: 2
  cost_per_10k_txn_budget: 20
  error_rate_budget: 0
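  # Illustrative example, assuming the budgets above are expressed in seconds,
  # dollars per 10,000 transactions and fraction of failed requests respectively:
  # an experiment averaging 1.5 s latency at $15 per 10k transactions with zero
  # failed requests would satisfy all three constraints, while one at 2.5 s would not.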
  # other misc reporting parameters, see 4_model_metric_analysis.ipynb
  # for more information
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025