
remove result section and change meta-llama 3.1 to llama 3.1

Kai Wu, 6 months ago
parent commit e1b7bc728c

+ 12 - 32
tools/benchmarks/llm_eval_harness/meta_eval/README.md
(file diff not shown because this file is too large)


+ 3 - 3
tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,7 +1,7 @@
-model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
 
-evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
-# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals"]
 
 tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
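Because every HuggingFace repo ID in this config drops the Meta- prefix, a typo here would otherwise only surface deep into a run. A minimal sanity check, sketched in Python, assuming huggingface_hub is installed and your token has access to the gated Llama 3.1 Evals collection:

    from huggingface_hub import dataset_info

    # New dataset IDs introduced by this commit; all drop the "Meta-" prefix.
    for repo_id in [
        "meta-llama/Llama-3.1-8B-Instruct-evals",
        "meta-llama/Llama-3.1-70B-Instruct-evals",
        "meta-llama/Llama-3.1-405B-Instruct-evals",
        "meta-llama/Llama-3.1-8B-evals",
        "meta-llama/Llama-3.1-70B-evals",
        "meta-llama/Llama-3.1-405B-evals",
    ]:
        # Raises a repository-not-found / gated-repo error on a bad ID or missing access.
        print(repo_id, "->", dataset_info(repo_id).last_modified)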

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/bbh_3shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__bbh__details
 task: meta_bbh
 output_type: generate_until
 process_docs: !function utils.process_docs
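The rename has to land in every task template as well, since dataset_path and dataset_name are handed straight to the datasets loader. Roughly what the harness does for this task, as an illustrative sketch (assuming the datasets package and access to the gated repo; the "latest" split name is taken from the mmlu_pro templates below):

    from datasets import load_dataset

    # dataset_path -> repo id, dataset_name -> config name inside the repo
    ds = load_dataset(
        "meta-llama/Llama-3.1-8B-evals",
        name="Llama-3.1-8B-evals__bbh__details",
        split="latest",
    )
    print(ds.column_names)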

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__gpqa__details
 task: meta_gpqa
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_instruct
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_pretrain
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 15 - 15
tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -16,12 +16,12 @@ from datasets import Dataset, load_dataset
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -59,12 +59,12 @@ def get_ifeval_data(model_name, output_dir):
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
         )
     original_dataset_name = "lighteval/MATH-Hard"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -130,7 +130,7 @@ def change_yaml(args, base_name):
         with open(output_path, "w") as output:
             for line in lines:
                 output.write(
-                    line.replace("Meta-Llama-3.1-8B", base_name).replace(
+                    line.replace("Llama-3.1-8B", base_name).replace(
                         "WORK_DIR", str(yaml_dir)
                     )
                 )
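Note that this replace target must track the rename exactly: the templates now contain the literal Llama-3.1-8B, so a leftover Meta-Llama-3.1-8B target would match nothing and silently leave every template pointing at the 8B dataset. A minimal illustration of the substitution, using a made-up template line:

    # Hypothetical template line, mirroring the bbh YAML above.
    line = "dataset_path: meta-llama/Llama-3.1-8B-evals"

    # Retarget the template at another model, as change_yaml does.
    print(line.replace("Llama-3.1-8B", "Llama-3.1-70B-Instruct"))
    # dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals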
@@ -208,12 +208,12 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in [
-        "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-70B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-405B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-8B-evals",
-        "meta-llama/Meta-Llama-3.1-70B-evals",
-        "meta-llama/Meta-Llama-3.1-405B-evals",
+        "meta-llama/Llama-3.1-8B-Instruct-evals",
+        "meta-llama/Llama-3.1-70B-Instruct-evals",
+        "meta-llama/Llama-3.1-405B-Instruct-evals",
+        "meta-llama/Llama-3.1-8B-evals",
+        "meta-llama/Llama-3.1-70B-evals",
+        "meta-llama/Llama-3.1-405B-evals",
     ]:
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection"