
remove result section and change meta-llama 3.1 to llama 3.1

Kai Wu, 6 months ago
commit e1b7bc728c
+ 12 - 32
tools/benchmarks/llm_eval_harness/meta_eval/README.md
(file diff suppressed because it is too large)

+ 3 - 3
tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,7 +1,7 @@
-model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
 
-evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
-# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals"]
 
 
 tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
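This commit tracks the Hugging Face Hub rename that dropped the `Meta-` prefix from the 3.1 repos (the old IDs may still redirect, but the harness matches names exactly). A quick way to confirm a renamed dataset ID resolves before launching a full run, as a minimal sketch assuming `datasets` is installed and you have access to the gated repo:

```python
# Hypothetical sanity check, not part of this commit: load one split of the
# renamed evals dataset to confirm the new Hub ID resolves.
from datasets import load_dataset

ds = load_dataset(
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
    split="latest",  # the task templates below also read the "latest" split
)
print(ds.column_names)
```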

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/bbh_3shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__bbh__details
 task: meta_bbh
 output_type: generate_until
 process_docs: !function utils.process_docs
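In lm-evaluation-harness task YAMLs, `!function utils.process_docs` resolves to a Python function in the `utils.py` that sits next to the template; it receives the raw `datasets.Dataset` and returns the processed one. A rough sketch of the shape such a hook takes (the field names here are illustrative assumptions, not copied from the real `utils.py`):

```python
# Illustrative shape of a process_docs hook; the actual utils.py in this
# directory defines the real field mapping.
from datasets import Dataset

def process_docs(dataset: Dataset) -> Dataset:
    def _map(doc: dict) -> dict:
        # Hypothetical field names: pull the prompt and gold answer
        # out of the raw evals record.
        return {
            "problem": doc["input_final_prompts"][0],
            "answer": doc["input_correct_responses"][0],
        }
    return dataset.map(_map)
```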

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__gpqa__details
 task: meta_gpqa
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_instruct
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_pretrain
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 15 - 15
tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -16,12 +16,12 @@ from datasets import Dataset, load_dataset
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
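Because the Hub ID is derived as `f"meta-llama/{model_name}-evals"`, dropping the `Meta-` prefix from the accepted short names above is exactly what keeps the derived ID pointing at the renamed repos:

```python
# How the short name expands after this commit (illustrative):
model_name = "Llama-3.1-8B-Instruct"
meta_dataset_name = f"meta-llama/{model_name}-evals"
assert meta_dataset_name == "meta-llama/Llama-3.1-8B-Instruct-evals"
```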
@@ -59,12 +59,12 @@ def get_ifeval_data(model_name, output_dir):
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
         )
     original_dataset_name = "lighteval/MATH-Hard"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -130,7 +130,7 @@ def change_yaml(args, base_name):
         with open(output_path, "w") as output:
             for line in lines:
                 output.write(
-                    line.replace("Meta-Llama-3.1-8B", base_name).replace(
+                    line.replace("Llama-3.1-8B", base_name).replace(
                         "WORK_DIR", str(yaml_dir)
                     )
                 )
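This hunk is the crux of the rename for the templates: every shipped YAML uses the 8B name as a placeholder, and `change_yaml` rewrites it to the model actually under evaluation. Condensed into a standalone sketch (the function name and signature are mine; the two `replace` calls are taken from the diff):

```python
# Standalone sketch of the substitution change_yaml performs above.
# base_name might be e.g. "Llama-3.1-70B-Instruct"; after this commit the
# placeholder searched for is "Llama-3.1-8B" rather than "Meta-Llama-3.1-8B".
def rewrite_template(src_path: str, dst_path: str, base_name: str, yaml_dir: str) -> None:
    with open(src_path) as f:
        lines = f.readlines()
    with open(dst_path, "w") as out:
        for line in lines:
            out.write(
                line.replace("Llama-3.1-8B", base_name).replace("WORK_DIR", yaml_dir)
            )
```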
@@ -208,12 +208,12 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in [
-        "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-70B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-405B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-8B-evals",
-        "meta-llama/Meta-Llama-3.1-70B-evals",
-        "meta-llama/Meta-Llama-3.1-405B-evals",
+        "meta-llama/Llama-3.1-8B-Instruct-evals",
+        "meta-llama/Llama-3.1-70B-Instruct-evals",
+        "meta-llama/Llama-3.1-405B-Instruct-evals",
+        "meta-llama/Llama-3.1-8B-evals",
+        "meta-llama/Llama-3.1-70B-evals",
+        "meta-llama/Llama-3.1-405B-evals",
     ]:
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection"