1 năm trước cách đây · 4f9050f748
--- a/tools/benchmarks/llm_eval_harness/meta_eval/README.md
+++ b/tools/benchmarks/llm_eval_harness/meta_eval/README.md
--- a/tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml
+++ b/tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml
@@ -1,18 +1,19 @@
 
				 model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
			
 
				 
			
 
				-evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
			
 
				-# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals"]
			
 
				+evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the Llama 3.x evals dataset to evaluate; please make sure this evals dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
			
 
				+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals"]
			
 
				 
			
 
				-tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
			
 
				-# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
			
 
				+tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
			
 
				+# Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
			
 
				+# Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them.
			
 
				 
			
 
				-tensor_parallel_size: 1 # The VLLM argument that speicify the tensor parallel size for the model, eg how many GPUs to use for a model copy.
			
 
				+tensor_parallel_size: 1 # The VLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for a model copy.
			
 
				 
			
 
				-data_parallel_size: 4 # The VLLM argument that speicify the data parallel size for the model, eg how copies of model will be used.
			
 
				+data_parallel_size: 4 # The VLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
			
 
				 
			
 
				-gpu_memory_utilization: 0.9 #The VLLM argument that speicify gpu memory utilization, the rest will be reserved for KV cache.
			
 
				+gpu_memory_utilization: 0.9 # The VLLM argument that specifies the GPU memory utilization; the rest will be reserved for KV cache.
			
 
				 
			
 
				-max_model_len: 8192 #The VLLM argument that speicify model max length, decrease this value only if GPU memory issue encountered. Please make sure the max_gen_toks in the yaml does not exceed this length.
			
 
				+max_model_len: 8192 # The VLLM argument that specifies the model max length; decrease this value only if a GPU memory issue is encountered. Please make sure the max_gen_toks in the yaml does not exceed this length.
			
 
				 
			
 
				 batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' for vllm to speed up the inference.
			
 
				 
			
--- a/tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu.yaml
+++ b/tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu.yaml
@@ -0,0 +1,14 @@
 
				+task: meta_mmlu
			
 
				+dataset_path: meta-llama/Llama-3.1-8B-evals
			
 
				+dataset_name: Llama-3.1-8B-evals__mmlu__details
			
 
				+test_split: latest
			
 
				+output_type: multiple_choice
			
 
				+process_docs: !function utils.process_docs
			
 
				+doc_to_text: !function utils.doc_to_text
			
 
				+doc_to_target: !function utils.doc_to_target
			
 
				+doc_to_choice: ["A", "B", "C", "D"]
			
 
				+# 5-shot prompts are already included in the dataset
			
 
				+# So no need to generate
			
 
				+num_fewshot: 0
			
 
				+metadata:
			
 
				+  version: 1.0
			
--- a/tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/utils.py
+++ b/tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/utils.py
@@ -0,0 +1,31 @@
 
				+import string
			
 
				+import datasets
			
 
				+
			
 
				+def doc_to_text(doc: dict) -> str:
			
 
				+    # Strip out the last two characters, which is a space and the answer
			
 
				+    # E.g., "Answer: B" -> "Answer:"
			
 
				+    return doc["input_final_prompts"][0][:-2]
			
 
				+
			
 
				+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
			
 
				+    def _process_doc(doc: dict) -> dict:
			
 
				+        # input_correct_responses is in format of: "Answer: B"
			
 
				+        answer = doc["input_correct_responses"][0]
			
 
				+        # Indexes are always A: 0, B: 1, C: 2, D: 3
			
 
				+        answer_index = string.ascii_uppercase.index(answer[-1])
			
 
				+
			
 
				+        out_doc = {
			
 
				+            "problem": doc["input_question"],
			
 
				+            # The answer is the index of the correct response (0-indexed)
			
 
				+            "gold": answer_index,
			
 
				+        }
			
 
				+        return out_doc
			
 
				+
			
 
				+    dataset = dataset.select_columns(
			
 
				+        ["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
			
 
				+         "input_choice_list"])
			
 
				+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
			
 
				+    dataset = dataset.map(_process_doc)
			
 
				+    return dataset.map(_process_doc)
			
 
				+
			
 
				+def doc_to_target(doc: dict) -> str:
			
 
				+    return doc["gold"]
			
--- a/tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py
+++ b/tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py
@@ -134,6 +134,19 @@ def change_yaml(args, base_name):
 
				                         "WORK_DIR", str(yaml_dir)
			
 
				                     )
			
 
				                 )
			
 
				+    # 3.2 evals dataset has a different set of evals
			
 
				+    if args.evals_dataset in [
			
 
				+        "meta-llama/Llama-3.2-1B-evals",
			
 
				+        "meta-llama/Llama-3.2-3B-evals",
			
 
				+    ]:
			
 
				+        # Change meta_pretrain.yaml to load in supported evals
			
 
				+        with open(args.template_dir + "/meta_pretrain.yaml", "r") as yaml_file:
			
 
				+            meta_pretrain = yaml.safe_load(yaml_file)
			
 
				+            meta_pretrain["task"] = ["meta_mmlu"]
			
 
				+        
			
 
				+        with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
			
 
				+            yaml.dump(meta_pretrain, yaml_file)
			
 
				+
			
 
				 
			
 
				 
			
 
				 # copy the files and change the yaml file to use the correct model name
			
@@ -214,9 +227,11 @@ if __name__ == "__main__":
 
				         "meta-llama/Llama-3.1-8B-evals",
			
 
				         "meta-llama/Llama-3.1-70B-evals",
			
 
				         "meta-llama/Llama-3.1-405B-evals",
			
 
				+        "meta-llama/Llama-3.2-1B-evals",
			
 
				+        "meta-llama/Llama-3.2-3B-evals",
			
 
				     ]:
			
 
				         raise ValueError(
			
 
				-            "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection"
			
 
				+            "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection. Note that 3.2-Instruct evals are not yet supported."
			
 
				         )
			
 
				     args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
			
 
				     # Copy the all files from template folder to the work folder