
fix meta_eval

Kai Wu, 3 months ago
commit ef939b2f28

+ 5 - 4
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/README.md

@@ -22,15 +22,16 @@ Given those differences, the numbers from this recipe can not be compared to the
 
 ## Environment setups
 
-Please install lm-evaluation-harness and our llama-recipe repo by following:
+Please install lm-evaluation-harness and our llama-cookbook repo by following:
 
 ```
-git clone git@github.com:meta-llama/llama-recipes.git
-cd llama-recipes
+git clone git@github.com:meta-llama/llama-cookbook.git
+cd llama-cookbook
 pip install -U pip setuptools
 pip install -e .
+pip install -U antlr4_python3_runtime==4.11
 pip install lm-eval[math,ifeval,sentencepiece,vllm]==0.4.3
-cd tools/benchmarks/llm_eval_harness/meta_eval
+cd end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval
 ```
 
 To access our [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), you must:
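The access checklist is cut off at the hunk boundary above, but the gist is that the evals collection is gated and requires an authenticated Hugging Face session before `load_dataset` can pull from it. A minimal sketch of logging in from Python (the `HF_TOKEN` environment variable is an assumption; any token granted access to the collection works):

```python
# Sketch: authenticate so the gated meta-llama evals datasets can be downloaded.
# Assumes HF_TOKEN holds a token that has been granted access to the collection.
import os

from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # a one-time `huggingface-cli login` also works
```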

+ 31 - 17
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -3,33 +3,35 @@
 
 import argparse
 import errno
-import shutil
 import glob
 import os
+import shutil
 from pathlib import Path
+
 import nltk
 import yaml
 from datasets import Dataset, load_dataset
 
-LLAMA_3_1_INSTRUCT_EVALS=[
+LLAMA_3_1_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.1-8B-Instruct-evals",
     "meta-llama/Llama-3.1-70B-Instruct-evals",
     "meta-llama/Llama-3.1-405B-Instruct-evals",
 ]
-LLAMA_3_1_PRETRAIN_EVALS=[
+LLAMA_3_1_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.1-8B-evals",
     "meta-llama/Llama-3.1-70B-evals",
     "meta-llama/Llama-3.1-405B-evals",
 ]
-LLAMA_3_2_INSTRUCT_EVALS=[
+LLAMA_3_2_INSTRUCT_EVALS = [
     "meta-llama/Llama-3.2-1B-Instruct-evals",
     "meta-llama/Llama-3.2-3B-Instruct-evals",
 ]
-LLAMA_3_2_PRETRAIN_EVALS=[
+LLAMA_3_2_PRETRAIN_EVALS = [
     "meta-llama/Llama-3.2-1B-evals",
     "meta-llama/Llama-3.2-3B-evals",
 ]
 
+
 # get the ifeval  from the evals dataset and join it with the original ifeval datasets
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
@@ -37,9 +39,10 @@ def get_ifeval_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -80,11 +83,12 @@ def get_math_hard_data(model_name, output_dir):
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
         "Llama-3.1-405B-Instruct",
+        "Llama-3.3-70B-Instruct",
     ]:
         raise ValueError(
-            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
         )
-    original_dataset_name = "lighteval/MATH-Hard"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -95,6 +99,7 @@
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math_hard.parquet")
 
+
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
@@ -104,7 +109,7 @@ def get_math_data(model_name, output_dir):
         raise ValueError(
             "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
         )
-    original_dataset_name = "lighteval/MATH"
+    original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
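Both MATH hunks above retarget the source data from the retired `lighteval/MATH` / `lighteval/MATH-Hard` repos to `DigitalLearningGmbH/MATH-lighteval`. A quick sanity check that the replacement dataset resolves might look like this (the `train` split and the absence of a config argument are assumptions, not taken from the diff):

```python
# Sketch: confirm the renamed dataset loads. Adjust the split name, or pass a
# config, if the dataset builder on the Hub requires them.
from datasets import load_dataset

math_data = load_dataset("DigitalLearningGmbH/MATH-lighteval", split="train")
print(math_data)  # row count and column names
```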
@@ -115,6 +120,7 @@
     joined = join_meta_and_original_math_data(meta_data, math_data)
     joined.to_parquet(output_dir + "/joined_math.parquet")
 
+
 def join_meta_and_original_math_data(meta_data, math_data):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
@@ -138,6 +144,7 @@ def join_meta_and_original_math_data(meta_data, math_data):
     )
     return joined
 
+
 # get the question from the ifeval dataset
 def get_question(example):
     try:
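For context, `join_meta_and_original_math_data` pairs each row of the meta evals dataset with its original MATH record before the parquet files above are written. A hedged sketch of that join pattern (the key columns `input_question` and `problem` are assumptions for illustration, not necessarily the script's exact ones):

```python
# Illustrative sketch of the join: align meta-evals rows with original MATH
# records on the problem text, then convert back to a datasets.Dataset.
from datasets import Dataset

def join_on_problem_text(meta_data: Dataset, math_data: Dataset) -> Dataset:
    meta_df = meta_data.to_pandas()
    math_df = math_data.to_pandas()
    joined = meta_df.merge(
        math_df, left_on="input_question", right_on="problem", how="inner"
    )
    return Dataset.from_pandas(joined)
```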
@@ -184,12 +191,17 @@ def change_yaml(args, base_name):
         meta_pretrain["task"] = ["meta_mmlu"]
     with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
         yaml.dump(meta_pretrain, yaml_file)
-    
+
     # Update tasks in meta_instruct.yaml
     with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
         meta_instruct = yaml.safe_load(yaml_file)
     if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
-        meta_instruct["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
+        meta_instruct["task"] = [
+            "meta_ifeval",
+            "meta_math_hard",
+            "meta_gpqa_cot",
+            "meta_mmlu_pro_instruct",
+        ]
     elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
         meta_instruct["task"] = ["meta_mmlu", "meta_math", "meta_gpqa"]
     with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
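The `change_yaml` hunk above follows a plain read-modify-write cycle on the task-group templates. A standalone sketch of the same pattern (the file path is a placeholder; `sort_keys=False` is an optional tweak to keep the template's key order, which plain `yaml.dump` would otherwise sort):

```python
# Sketch: load a task-group template, point its "task" list at the desired
# subtasks, and write it back.
import yaml

with open("work_dir/meta_instruct.yaml") as f:
    cfg = yaml.safe_load(f)

cfg["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]

with open("work_dir/meta_instruct.yaml", "w") as f:
    yaml.dump(cfg, f, sort_keys=False)
```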
@@ -199,7 +211,7 @@ def change_yaml(args, base_name):
 # copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
     # nltk punkt_tab package is needed
-    nltk.download('punkt_tab')
+    nltk.download("punkt_tab")
     copy_dir(args.template_dir, args.work_dir)
     # Use the template yaml to get the correct model name in work_dir yaml
     base_name = (
@@ -227,7 +239,9 @@ def prepare_datasets(args):
     if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
         get_ifeval_data(model_name, args.work_dir)
         get_math_hard_data(model_name, args.work_dir)
-    elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
+    elif (
+        "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS
+    ):
         get_math_data(model_name, args.work_dir)
     else:
         if "meta_ifeval" in task_list:
@@ -264,10 +278,10 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in (
-        LLAMA_3_1_INSTRUCT_EVALS +
-        LLAMA_3_1_PRETRAIN_EVALS +
-        LLAMA_3_2_INSTRUCT_EVALS +
-        LLAMA_3_2_PRETRAIN_EVALS
+        LLAMA_3_1_INSTRUCT_EVALS
+        + LLAMA_3_1_PRETRAIN_EVALS
+        + LLAMA_3_2_INSTRUCT_EVALS
+        + LLAMA_3_2_PRETRAIN_EVALS
     ):
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection."