eval_config.yaml

model_name: "meta-llama/Llama-3.2-3B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name on the Hugging Face model hub.
evals_dataset: "meta-llama/Llama-3.2-3B-Instruct-evals" # The name of the 3.x evals dataset to evaluate; please make sure this evals dataset corresponds to the loaded model. This must be a valid dataset name in the Llama 3.x Evals collection.
# Must be one of the following: ["meta-llama/Llama-3.1-8B-Instruct-evals", "meta-llama/Llama-3.1-70B-Instruct-evals", "meta-llama/Llama-3.1-405B-Instruct-evals", "meta-llama/Llama-3.1-8B-evals", "meta-llama/Llama-3.1-70B-evals", "meta-llama/Llama-3.1-405B-evals", "meta-llama/Llama-3.2-1B-evals", "meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
tasks: "meta_instruct" # Available tasks for 3.1 instruct models: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
# Available tasks for 3.1 pretrained models: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
# Available tasks for 3.2 instruct models: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
# Available tasks for 3.2 pretrained models: "meta_mmlu"; or just use "meta_pretrain" to run all of them.
tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, i.e. how many GPUs to use for each model copy.
data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, i.e. how many model copies will be used.
gpu_memory_utilization: 0.9 # The vLLM argument that specifies the fraction of GPU memory to use (model weights, activations, and KV cache).
max_model_len: 8192 # The vLLM argument that specifies the model's max length; decrease this value only if you run into GPU memory issues. Please make sure max_gen_toks in the task yaml does not exceed this length.
batch_size: "auto" # Batch size; can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' with vLLM to speed up inference.
output_path: "eval_results" # The output folder to store all the eval results and samples.
#limit: 12 # Limit the number of examples per task; set to 'null' to run all.
limit: null # Limit the number of examples per task; set to 'null' to run all.
verbosity: "INFO" # Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
log_samples: true # If true, write out all model outputs and documents for per-sample measurement and post-hoc analysis.
work_dir: ./work_dir # The work folder where the task template yaml files will be copied and modified, and where datasets will be downloaded for math_hard and ifeval.
template_dir: ./meta_template # Path to the folder that contains all the Meta task templates.
show_config: false # If true, show the full config of all tasks at the end of the evaluation.
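
For reference, the sketch below illustrates how these fields could map onto an lm-evaluation-harness run with the vLLM backend. It is a hedged, illustrative script, not the repo's actual prepare/run tooling: it assumes PyYAML is available and that the `lm_eval` CLI is installed with its standard flags (`--model vllm`, `--model_args`, `--tasks`, `--include_path`, etc.). Note that `tensor_parallel_size * data_parallel_size` should match the number of GPUs you intend to use (1 x 4 = 4 with the values above).

```python
# run_eval_sketch.py -- illustrative only; assumes PyYAML and the lm_eval CLI.
import subprocess
import yaml

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

# vLLM backend arguments, taken directly from the config fields above.
model_args = ",".join([
    f"pretrained={cfg['model_name']}",
    f"tensor_parallel_size={cfg['tensor_parallel_size']}",
    f"data_parallel_size={cfg['data_parallel_size']}",
    f"gpu_memory_utilization={cfg['gpu_memory_utilization']}",
    f"max_model_len={cfg['max_model_len']}",
])

cmd = [
    "lm_eval",
    "--model", "vllm",
    "--model_args", model_args,
    "--tasks", cfg["tasks"],
    "--batch_size", str(cfg["batch_size"]),
    "--output_path", cfg["output_path"],
    "--include_path", cfg["work_dir"],  # folder holding the copied/modified task templates
    "--verbosity", cfg["verbosity"],
]
if cfg.get("log_samples"):
    cmd.append("--log_samples")
if cfg.get("limit") is not None:
    cmd += ["--limit", str(cfg["limit"])]
if cfg.get("show_config"):
    cmd.append("--show_config")

subprocess.run(cmd, check=True)
```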