Browse files

fix meta_eval after refactor and add new meta_mmlu_instruct task for 3.2 (#862)

Sanyam Bhutani 4 months ago
parent
commit
6bfd034504

+ 3 - 2
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/README.md

@@ -22,15 +22,16 @@ Given those differences, the numbers from this recipe can not be compared to the
 
 
 ## Environment setups
 
-Please install lm-evaluation-harness and our llama-recipe repo by following:
+Please install lm-evaluation-harness and our llama-cookbook repo by following:
 
 ```
 git clone git@github.com:meta-llama/llama-cookbook.git
 cd llama-cookbook
 pip install -U pip setuptools
 pip install -e .
+pip install -U antlr4_python3_runtime==4.11
 pip install lm-eval[math,ifeval,sentencepiece,vllm]==0.4.3
-cd tools/benchmarks/llm_eval_harness/meta_eval
+cd end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval
 ```
 
 To access our [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), you must:
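
The concrete access steps are elided above. Purely as an illustration (not part of this commit), gated meta-llama datasets are typically reachable only after authenticating with `huggingface_hub`:

```python
# Illustrative sketch only; assumes you hold a Hugging Face access token
# that has been granted access to the gated meta-llama repositories.
from huggingface_hub import login

login()  # interactively prompts for the token
```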

+ 2 - 2
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,6 +1,6 @@
-model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.2-3B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
-evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
+evals_dataset: "meta-llama/Llama-3.2-3B-Instruct-evals" # The name of the 3.x evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
 # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
 tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
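
Since the config requires the evals dataset to correspond to the loaded model, a small sanity check can catch mismatches before a long run. A minimal sketch (the check is ours, not part of the recipe; it assumes the `<model>`/`<model>-evals` naming shown above):

```python
import yaml

# Load the eval config and verify the model / evals-dataset pairing.
with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

model = cfg["model_name"].split("/")[-1]       # e.g. "Llama-3.2-3B-Instruct"
dataset = cfg["evals_dataset"].split("/")[-1]  # e.g. "Llama-3.2-3B-Instruct-evals"
assert dataset == f"{model}-evals", f"{dataset} does not match {model}"
```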

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu_instruct.yaml

@@ -0,0 +1,29 @@
+task: meta_mmlu_instruct
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs_instruct
+doc_to_text: !function utils.doc_to_text_instruct
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: ' ([A-D])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
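
The strict-match filter above collects every `' ([A-D])'` match in the generation and keeps only the last one (`group_select: -1`), which `take_first` then reduces to a single answer string. A minimal sketch of that behavior, using an invented model response:

```python
import re

# Invented response; real generations come from the model under evaluation.
response = "Option C is tempting, but the best answer is B"

matches = re.findall(r" ([A-D])", response)  # regex_pattern from the YAML
answer = matches[-1] if matches else None    # group_select: -1 -> last match
print(answer)  # -> B
```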

+ 3 - 3
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/mmlu.yaml

@@ -3,12 +3,12 @@ dataset_path: meta-llama/Llama-3.1-8B-evals
 dataset_name: Llama-3.1-8B-evals__mmlu__details
 test_split: latest
 output_type: multiple_choice
-process_docs: !function utils.process_docs
-doc_to_text: !function utils.doc_to_text
+process_docs: !function utils.process_docs_pretrain
+doc_to_text: !function utils.doc_to_text_pretrain
 doc_to_target: !function utils.doc_to_target
 doc_to_choice: ["A", "B", "C", "D"]
 # 5-shot prompts are already included in the dataset
 # So no need to generate
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 1.0

+ 46 - 4
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu/utils.py

@@ -1,12 +1,22 @@
 import string
+
 import datasets
 
-def doc_to_text(doc: dict) -> str:
+
+def doc_to_text_pretrain(doc: dict) -> str:
     # Strip out the last two characters, which is a space and the answer
     # E.g., "Answer: B" -> "Answer:"
     return doc["input_final_prompts"][0][:-2]
+
+
+def doc_to_text_instruct(doc: dict) -> str:
+    # Return the full prompt unchanged; unlike the pretrain variant, nothing
+    # is stripped, since the answer letter is parsed from the generation by
+    # the task's strict-match regex filter.
+    return doc["input_final_prompts"][0]
+
 
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+def process_docs_pretrain(dataset: datasets.Dataset) -> datasets.Dataset:
     def _process_doc(doc: dict) -> dict:
         # input_correct_responses is in format of: "Answer: B"
         answer = doc["input_correct_responses"][0]
@@ -21,11 +31,43 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
         return out_doc
 
     dataset = dataset.select_columns(
-        ["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
-         "input_choice_list"])
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
+
+
+def process_docs_instruct(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
     dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
     return dataset.map(_process_doc)
 
+
 def doc_to_target(doc: dict) -> str:
     return doc["gold"]
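
To make the pretrain/instruct split concrete, here are both helpers applied to toy docs (only the `input_final_prompts` field name comes from the code above; the prompt strings are invented):

```python
# Toy docs with invented prompt strings.
pretrain_doc = {"input_final_prompts": ["Question: ...\nAnswer: B"]}
instruct_doc = {"input_final_prompts": ["Question: ...\nThe best answer is"]}

# Pretrain variant strips the trailing " B" so the model must complete it.
print(pretrain_doc["input_final_prompts"][0][:-2])  # "Question: ...\nAnswer:"

# Instruct variant passes the prompt through untouched; the answer letter is
# recovered from the generation by the strict-match regex filter instead.
print(instruct_doc["input_final_prompts"][0])
```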

+ 14 - 4
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py

@@ -1,13 +1,12 @@
 import string
 
-
 import datasets
 
 
-
 def doc_to_text(doc: dict) -> str:
     return doc["input_final_prompts"][0]
 
+
 def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
     def _process_doc(doc: dict) -> dict:
         out_doc = {
@@ -15,7 +14,18 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
             "gold": doc["input_correct_responses"][0],
         }
         return out_doc
-    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
-    dataset = dataset.rename_column("is_correct","previously_is_correct")
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
     dataset = dataset.map(_process_doc)
     return dataset.map(_process_doc)
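
The reformatted `process_docs` is behaviorally unchanged: select columns, rename `is_correct`, then map. A self-contained sketch of that pipeline on a two-row toy dataset (only the column names match the code above; the values are invented):

```python
import datasets

ds = datasets.Dataset.from_dict(
    {
        "input_question": ["q1", "q2"],
        "input_correct_responses": [["B"], ["D"]],
        "is_correct": [True, False],
    }
)
ds = ds.rename_column("is_correct", "previously_is_correct")
ds = ds.map(
    lambda doc: {
        "problem": doc["input_question"],
        "gold": doc["input_correct_responses"][0],
    }
)
print(ds[0]["gold"])  # -> "B"
```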

+ 33 - 19
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -3,33 +3,35 @@
 
 import argparse
 import errno
-import shutil
 import glob
 import os
+import shutil
 from pathlib import Path
+
 import nltk
 import yaml
 from datasets import Dataset, load_dataset
 
-LLAMA_3_1_INSTRUCT_EVALS=[
+LLAMA_3_1_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.1-8B-Instruct-evals",
     "meta-llama/Llama-3.1-70B-Instruct-evals",
     "meta-llama/Llama-3.1-405B-Instruct-evals",
 ]
-LLAMA_3_1_PRETRAIN_EVALS=[
+LLAMA_3_1_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.1-8B-evals",
     "meta-llama/Llama-3.1-70B-evals",
     "meta-llama/Llama-3.1-405B-evals",
 ]
-LLAMA_3_2_INSTRUCT_EVALS=[
+LLAMA_3_2_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.2-1B-Instruct-evals",
     "meta-llama/Llama-3.2-3B-Instruct-evals",
 ]
-LLAMA_3_2_PRETRAIN_EVALS=[
+LLAMA_3_2_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.2-1B-evals",
     "meta-llama/Llama-3.2-3B-evals",
 ]
 
+
 # get the ifeval from the evals dataset and join it with the original ifeval datasets
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
@@ -37,9 +39,10 @@ def get_ifeval_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -80,11 +83,12 @@ def get_math_hard_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
         )
-    original_dataset_name = "lighteval/MATH-Hard"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -95,6 +99,7 @@ def get_math_hard_data(model_name, output_dir):
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math_hard.parquet")
 
+
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
@@ -104,7 +109,7 @@ def get_math_data(model_name, output_dir):
         raise ValueError(
             "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
         )
-    original_dataset_name = "lighteval/MATH"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -115,6 +120,7 @@ def get_math_data(model_name, output_dir):
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math.parquet")
 
+
 def join_meta_and_original_math_data(meta_data, math_data):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
@@ -138,6 +144,7 @@ def join_meta_and_original_math_data(meta_data, math_data):
     )
     return joined
 
+
 # get the question from the ifeval dataset
 def get_question(example):
     try:
@@ -181,17 +188,22 @@ def change_yaml(args, base_name):
     if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
         meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
     elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
-        meta_pretrain["task"] = ["meta_mmlu"]
+        meta_pretrain["task"] = ["meta_mmlu_pretrain"]
     with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
         yaml.dump(meta_pretrain, yaml_file)
-    
+
     # Update tasks in meta_instruct.yaml
     with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
         meta_instruct = yaml.safe_load(yaml_file)
     if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
-        meta_instruct["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
+        meta_instruct["task"] = [
+            "meta_ifeval",
+            "meta_math_hard",
+            "meta_gpqa_cot",
+            "meta_mmlu_pro_instruct",
+        ]
     elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
-        meta_instruct["task"] = ["meta_mmlu", "meta_math", "meta_gpqa"]
+        meta_instruct["task"] = ["meta_mmlu_instruct", "meta_math", "meta_gpqa"]
     with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
         yaml.dump(meta_instruct, yaml_file)
 
@@ -199,7 +211,7 @@ def change_yaml(args, base_name):
 # copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
     # nltk punkt_tab package is needed
-    nltk.download('punkt_tab')
+    nltk.download("punkt_tab")
     copy_dir(args.template_dir, args.work_dir)
     # Use the template yaml to get the correct model name in work_dir yaml
     base_name = (
@@ -227,7 +239,9 @@ def prepare_datasets(args):
     if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
         get_ifeval_data(model_name, args.work_dir)
         get_math_hard_data(model_name, args.work_dir)
-    elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
+    elif (
+        "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS
+    ):
         get_math_data(model_name, args.work_dir)
     else:
         if "meta_ifeval" in task_list:
@@ -264,10 +278,10 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in (
-        LLAMA_3_1_INSTRUCT_EVALS +
-        LLAMA_3_1_PRETRAIN_EVALS +
-        LLAMA_3_2_INSTRUCT_EVALS +
-        LLAMA_3_2_PRETRAIN_EVALS
+        LLAMA_3_1_INSTRUCT_EVALS
+        + LLAMA_3_1_PRETRAIN_EVALS
+        + LLAMA_3_2_INSTRUCT_EVALS
+        + LLAMA_3_2_PRETRAIN_EVALS
     ):
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection."
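
Taken together, the `change_yaml` edits above give each eval family its own task list; summarized as a plain dict (our own summary, not code from the repo):

```python
# Task routing after this commit, keyed by evals dataset family.
TASKS_BY_FAMILY = {
    "3.1 pretrain": ["meta_bbh", "meta_mmlu_pro_pretrain"],
    "3.2 pretrain": ["meta_mmlu_pretrain"],
    "3.1 instruct": ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot",
                     "meta_mmlu_pro_instruct"],
    "3.2 instruct": ["meta_mmlu_instruct", "meta_math", "meta_gpqa"],
}
```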