Browse files

fix meta_eval after refactor and add new meta_mmlu_instruct task for 3.2 (#862)

Sanyam Bhutani 4 months ago
parent
commit
6bfd034504

+ 3 - 2
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/README.md

@@ -22,15 +22,16 @@ Given those differences, the numbers from this recipe can not be compared to the
 
 
 ## Environment setups
 
-Please install lm-evaluation-harness and our llama-recipe repo by following:
+Please install lm-evaluation-harness and our llama-cookbook repo by following:
 
 ```
 git clone git@github.com:meta-llama/llama-cookbook.git
 cd llama-cookbook
 pip install -U pip setuptools
 pip install -e .
+pip install -U antlr4_python3_runtime==4.11
 pip install lm-eval[math,ifeval,sentencepiece,vllm]==0.4.3
-cd tools/benchmarks/llm_eval_harness/meta_eval
+cd end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval
 ```
 
 To access our [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), you must:
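
The concrete access steps are elided above. Purely as an illustration (not part of this commit), gated meta-llama datasets are typically reachable only after authenticating with `huggingface_hub`:

```python
# Illustrative sketch only; assumes you hold a Hugging Face access token
# that has been granted access to the gated meta-llama repositories.
from huggingface_hub import login

login()  # interactively prompts for the token
```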

+ 2 - 2
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,6 +1,6 @@
-model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.2-3B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
-evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
+evals_dataset: "meta-llama/Llama-3.2-3B-Instruct-evals" # The name of the 3.x evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
 # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
 tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
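
Since the config requires the evals dataset to correspond to the loaded model, a small sanity check can catch mismatches before a long run. A minimal sketch (the check is ours, not part of the recipe; it assumes the `<model>`/`<model>-evals` naming shown above):

```python
import yaml

# Load the eval config and verify the model / evals-dataset pairing.
with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

model = cfg["model_name"].split("/")[-1]       # e.g. "Llama-3.2-3B-Instruct"
dataset = cfg["evals_dataset"].split("/")[-1]  # e.g. "Llama-3.2-3B-Instruct-evals"
assert dataset == f"{model}-evals", f"{dataset} does not match {model}"
```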

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu_instruct.yaml

@@ -0,0 +1,29 @@
+task: meta_mmlu_instruct
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs_instruct
+doc_to_text: !function utils.doc_to_text_instruct
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: ' ([A-D])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
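
The strict-match filter above collects every `' ([A-D])'` match in the generation and keeps only the last one (`group_select: -1`), which `take_first` then reduces to a single answer string. A minimal sketch of that behavior, using an invented model response:

```python
import re

# Invented response; real generations come from the model under evaluation.
response = "Option C is tempting, but the best answer is B"

matches = re.findall(r" ([A-D])", response)  # regex_pattern from the YAML
answer = matches[-1] if matches else None    # group_select: -1 -> last match
print(answer)  # -> B
```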

+ 3 - 3
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu.yaml

@@ -3,12 +3,12 @@ dataset_path: meta-llama/Llama-3.1-8B-evals
 dataset_name: Llama-3.1-8B-evals__mmlu__details
 test_split: latest
 output_type: multiple_choice
-process_docs: !function utils.process_docs
-doc_to_text: !function utils.doc_to_text
+process_docs: !function utils.process_docs_pretrain
+doc_to_text: !function utils.doc_to_text_pretrain
 doc_to_target: !function utils.doc_to_target
 doc_to_choice: ["A", "B", "C", "D"]
 # 5-shot prompts are already included in the dataset
 # So no need to generate
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 1.0

+ 46 - 4
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/utils.py

@@ -1,12 +1,22 @@
 import string
+
 import datasets
 
-def doc_to_text(doc: dict) -> str:
+
+def doc_to_text_pretrain(doc: dict) -> str:
     # Strip out the last two characters, which is a space and the answer
     # E.g., "Answer: B" -> "Answer:"
     return doc["input_final_prompts"][0][:-2]
+
+
+def doc_to_text_instruct(doc: dict) -> str:
+    # Return the full prompt unchanged; unlike the pretrain variant, nothing
+    # is stripped, since the answer letter is parsed from the generation by
+    # the task's strict-match regex filter.
+    return doc["input_final_prompts"][0]
+
 
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+def process_docs_pretrain(dataset: datasets.Dataset) -> datasets.Dataset:
     def _process_doc(doc: dict) -> dict:
         # input_correct_responses is in format of: "Answer: B"
         answer = doc["input_correct_responses"][0]
@@ -21,11 +31,43 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
         return out_doc
 
     dataset = dataset.select_columns(
-        ["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
-         "input_choice_list"])
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
+
+
+def process_docs_instruct(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
     dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
     return dataset.map(_process_doc)
 
+
 def doc_to_target(doc: dict) -> str:
     return doc["gold"]
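
To make the pretrain/instruct split concrete, here are both helpers applied to toy docs (only the `input_final_prompts` field name comes from the code above; the prompt strings are invented):

```python
# Toy docs with invented prompt strings.
pretrain_doc = {"input_final_prompts": ["Question: ...\nAnswer: B"]}
instruct_doc = {"input_final_prompts": ["Question: ...\nThe best answer is"]}

# Pretrain variant strips the trailing " B" so the model must complete it.
print(pretrain_doc["input_final_prompts"][0][:-2])  # "Question: ...\nAnswer:"

# Instruct variant passes the prompt through untouched; the answer letter is
# recovered from the generation by the strict-match regex filter instead.
print(instruct_doc["input_final_prompts"][0])
```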

+ 14 - 4
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py

@@ -1,13 +1,12 @@
 import string
 
-
 import datasets
 
 
-
 def doc_to_text(doc: dict) -> str:
     return doc["input_final_prompts"][0]
 
+
 def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
     def _process_doc(doc: dict) -> dict:
         out_doc = {
@@ -15,7 +14,18 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
             "gold": doc["input_correct_responses"][0],
         }
         return out_doc
-    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
-    dataset = dataset.rename_column("is_correct","previously_is_correct")
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
     return dataset.map(_process_doc)
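
The reformatted `process_docs` is behaviorally unchanged: select columns, rename `is_correct`, then map. A self-contained sketch of that pipeline on a two-row toy dataset (only the column names match the code above; the values are invented):

```python
import datasets

ds = datasets.Dataset.from_dict(
    {
        "input_question": ["q1", "q2"],
        "input_correct_responses": [["B"], ["D"]],
        "is_correct": [True, False],
    }
)
ds = ds.rename_column("is_correct", "previously_is_correct")
ds = ds.map(
    lambda doc: {
        "problem": doc["input_question"],
        "gold": doc["input_correct_responses"][0],
    }
)
print(ds[0]["gold"])  # -> "B"
```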

+ 33 - 19
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -3,33 +3,35 @@
 
 import argparse
 import errno
-import shutil
 import glob
 import os
+import shutil
 from pathlib import Path
+
 import nltk
 import yaml
 from datasets import Dataset, load_dataset
 
-LLAMA_3_1_INSTRUCT_EVALS=[
+LLAMA_3_1_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.1-8B-Instruct-evals",
     "meta-llama/Llama-3.1-70B-Instruct-evals",
     "meta-llama/Llama-3.1-405B-Instruct-evals",
 ]
-LLAMA_3_1_PRETRAIN_EVALS=[
+LLAMA_3_1_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.1-8B-evals",
     "meta-llama/Llama-3.1-70B-evals",
     "meta-llama/Llama-3.1-405B-evals",
 ]
-LLAMA_3_2_INSTRUCT_EVALS=[
+LLAMA_3_2_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.2-1B-Instruct-evals",
     "meta-llama/Llama-3.2-3B-Instruct-evals",
 ]
-LLAMA_3_2_PRETRAIN_EVALS=[
+LLAMA_3_2_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.2-1B-evals",
     "meta-llama/Llama-3.2-3B-evals",
 ]
 
+
 # get the ifeval from the evals dataset and join it with the original ifeval datasets
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
@@ -37,9 +39,10 @@ def get_ifeval_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -80,11 +83,12 @@ def get_math_hard_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
         )
-    original_dataset_name = "lighteval/MATH-Hard"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -95,6 +99,7 @@ def get_math_hard_data(model_name, output_dir):
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math_hard.parquet")
 
+
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
@@ -104,7 +109,7 @@ def get_math_data(model_name, output_dir):
         raise ValueError(
             "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
         )
-    original_dataset_name = "lighteval/MATH"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -115,6 +120,7 @@ def get_math_data(model_name, output_dir):
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math.parquet")
 
+
 def join_meta_and_original_math_data(meta_data, math_data):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
@@ -138,6 +144,7 @@ def join_meta_and_original_math_data(meta_data, math_data):
     )
     return joined
 
+
 # get the question from the ifeval dataset
 def get_question(example):
     try:
@@ -181,17 +188,22 @@ def change_yaml(args, base_name):
     if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
         meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
     elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
-        meta_pretrain["task"] = ["meta_mmlu"]
+        meta_pretrain["task"] = ["meta_mmlu_pretrain"]
     with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
         yaml.dump(meta_pretrain, yaml_file)
-    
+
     # Update tasks in meta_instruct.yaml
     with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
         meta_instruct = yaml.safe_load(yaml_file)
     if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
-        meta_instruct["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
+        meta_instruct["task"] = [
+            "meta_ifeval",
+            "meta_math_hard",
+            "meta_gpqa_cot",
+            "meta_mmlu_pro_instruct",
+        ]
     elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
-        meta_instruct["task"] = ["meta_mmlu", "meta_math", "meta_gpqa"]
+        meta_instruct["task"] = ["meta_mmlu_instruct", "meta_math", "meta_gpqa"]
     with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
         yaml.dump(meta_instruct, yaml_file)
 
@@ -199,7 +211,7 @@ def change_yaml(args, base_name):
 # copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
     # nltk punkt_tab package is needed
-    nltk.download('punkt_tab')
+    nltk.download("punkt_tab")
     copy_dir(args.template_dir, args.work_dir)
     # Use the template yaml to get the correct model name in work_dir yaml
     base_name = (
@@ -227,7 +239,9 @@ def prepare_datasets(args):
     if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
         get_ifeval_data(model_name, args.work_dir)
         get_math_hard_data(model_name, args.work_dir)
-    elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
+    elif (
+        "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS
+    ):
         get_math_data(model_name, args.work_dir)
     else:
         if "meta_ifeval" in task_list:
@@ -264,10 +278,10 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in (
-        LLAMA_3_1_INSTRUCT_EVALS +
-        LLAMA_3_1_PRETRAIN_EVALS +
-        LLAMA_3_2_INSTRUCT_EVALS +
-        LLAMA_3_2_PRETRAIN_EVALS
+        LLAMA_3_1_INSTRUCT_EVALS
+        + LLAMA_3_1_PRETRAIN_EVALS
+        + LLAMA_3_2_INSTRUCT_EVALS
+        + LLAMA_3_2_PRETRAIN_EVALS
     ):
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection."
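
Taken together, the `change_yaml` edits above give each eval family its own task list; summarized as a plain dict (our own summary, not code from the repo):

```python
# Task routing after this commit, keyed by evals dataset family.
TASKS_BY_FAMILY = {
    "3.1 pretrain": ["meta_bbh", "meta_mmlu_pro_pretrain"],
    "3.2 pretrain": ["meta_mmlu_pretrain"],
    "3.1 instruct": ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot",
                     "meta_mmlu_pro_instruct"],
    "3.2 instruct": ["meta_mmlu_instruct", "meta_math", "meta_gpqa"],
}
```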