Browse source code

Add gpqa and math evals for instruct models

Aidan Do 4 months ago
parent
commit
ab1b1450d7

+ 9 - 7
tools/benchmarks/llm_eval_harness/meta_eval/README.md

@@ -46,11 +46,11 @@ It is recommended to read the dataset card to understand the meaning of each col
 Given the extensive number of tasks available (12 for pretrained models and 30 for instruct models), a subset of tasks are chosen:
 
 - **Tasks for 3.1 pretrained models**: BBH and MMLU-Pro
-  - Chosen as they overlap with the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
-- **Tasks for 3.2 pretrained models**: MMLU
-  - Chosen because MMLU is a common eval, and is the first one shown on on [llama.com](https://llama.com)
 - **Tasks for 3.1 instruct models**: Math-Hard, IFeval, GPQA, and MMLU-Pro
-  - Chosen as they overlap with the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+- **Tasks for 3.2 pretrained models**: MMLU
+- **Tasks for 3.2 instruct models**: MMLU, GPQA
+
+These tasks are common evaluations, many of which overlap with the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard).
 
 Here, we aim to get the benchmark numbers on the aforementioned tasks using Hugging Face [leaderboard implementation](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard). Please follow the instructions below to make necessary modifications to use our eval prompts and get more eval metrics.
 
@@ -63,10 +63,12 @@ Here, we aim to get the benchmark numbers on the aforementioned tasks using Hugg
 model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Llama 3 based model name in the HuggingFace model hub."
 
 evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
-# Must be one of the following # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals"]
+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
-tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
-# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
+tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+# Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
+# Available tasks for 3.2 instruct model: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
+# Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them
 
 tensor_parallel_size: 1 # The VLLM argument that specify the tensor parallel size for the model, eg how many GPUs to use for a model copy.
 

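For orientation, here is a minimal sketch (not part of the change) of what the `evals_dataset` option points at. It assumes you have been granted access to the gated meta-llama evals repos on the Hugging Face Hub; the config name and `latest` split mirror the GPQA task definition added below.

```python
# Sketch: peek at the evals dataset referenced by eval_config.yaml.
# Assumes access to the gated meta-llama repos on the Hugging Face Hub.
from datasets import load_dataset

ds = load_dataset(
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    name="Llama-3.1-8B-Instruct-evals__gpqa__details",
    split="latest",
)
print(ds.column_names)                  # includes input_question, input_final_prompts, ...
print(ds[0]["input_final_prompts"][0])  # the pre-rendered prompt used for this row
```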
+ 3 - 2
tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,10 +1,11 @@
 model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
 evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
-# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals"]
+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
-tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
+# Available tasks for 3.2 instruct model: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them
 
 tensor_parallel_size: 1 # The VLLM argument that specify the tensor parallel size for the model, eg how many GPUs to use for a model copy.

+ 29 - 0
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa/gpqa_0shot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__gpqa__details
+task: meta_gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: ' ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

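The `strict-match` filter above extracts the final answer letter from the model's free-form generation: `regex_pattern: ' ([A-Z])'` with `group_select: -1` keeps the last space-preceded capital letter, and `exact_match` then compares it against the gold answer ignoring case and punctuation. Outside the harness, the behavior is roughly this (a sketch, not the harness's actual filter code):

```python
import re

# Rough equivalent of the "strict-match" filter in gpqa_0shot.yaml.
generation = "Reasoning about the options... The best answer is B."
matches = re.findall(r" ([A-Z])", generation)  # regex_pattern
answer = matches[-1] if matches else ""        # group_select: -1 takes the last match
print(answer)  # -> "B"
```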
+ 19 - 0
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa/utils.py

@@ -0,0 +1,19 @@
+import random
+import re
+
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset

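A toy check of the new `process_docs`, assuming it is imported from the file above (the row values below are made up; the column names are the ones `select_columns` expects): it keeps the listed columns, renames `is_correct` to `previously_is_correct`, and adds the `problem`/`gold` fields used by the task.

```python
from datasets import Dataset

# Hypothetical one-row dataset with the columns process_docs selects.
toy = Dataset.from_dict({
    "input_question": ["Which option is correct?"],
    "input_correct_responses": [["A"]],
    "input_final_prompts": [["<rendered GPQA prompt>"]],
    "is_correct": [True],
    "input_question_hash": ["deadbeef"],
    "input_choice_list": [{"A": "first choice", "B": "second choice"}],
    "output_prediction_text": [["A"]],
})

processed = process_docs(toy)
print(processed[0]["gold"])    # -> "A"
print(processed.column_names)  # includes "previously_is_correct", "problem", "gold"
```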
+ 1 - 1
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -1,6 +1,6 @@
 dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
 dataset_name: Llama-3.1-8B-Instruct-evals__gpqa__details
-task: meta_gpqa
+task: meta_gpqa_cot
 output_type: generate_until
 process_docs: !function utils.process_docs
 test_split: latest

+ 21 - 0
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_4shot_cot.yaml

@@ -0,0 +1,21 @@
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_math.parquet
+task: meta_math
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

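The new `meta_math` task reads the parquet file that `prepare_meta_eval.py` writes into the work directory (see `get_math_data` further down). As a quick sanity check before running the harness, assuming the prepare step has already produced the file, it can be loaded the same way the task definition does:

```python
from datasets import load_dataset

# Sketch: mirrors dataset_path: parquet / data_files: ./work_dir/joined_math.parquet / test_split: train.
math_docs = load_dataset(
    "parquet",
    data_files="./work_dir/joined_math.parquet",
    split="train",
)
print(len(math_docs), math_docs.column_names)
```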
+ 1 - 1
tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_hard_0shot_cot.yaml

@@ -1,6 +1,6 @@
 dataset_path: parquet
 dataset_kwargs:
-  data_files: ./work_dir/joined_math.parquet
+  data_files: ./work_dir/joined_math_hard.parquet
 task: meta_math_hard
 process_docs: !function utils.process_docs
 output_type: generate_until

+ 73 - 42
tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py

@@ -11,6 +11,24 @@ import nltk
 import yaml
 from datasets import Dataset, load_dataset
 
+LLAMA_3_1_INSTRUCT_EVALS=[
+    "meta-llama/Llama-3.1-8B-Instruct-evals",
+    "meta-llama/Llama-3.1-70B-Instruct-evals",
+    "meta-llama/Llama-3.1-405B-Instruct-evals",
+]
+LLAMA_3_1_PRETRAIN_EVALS=[
+    "meta-llama/Llama-3.1-8B-evals",
+    "meta-llama/Llama-3.1-70B-evals",
+    "meta-llama/Llama-3.1-405B-evals",
+]
+LLAMA_3_2_INSTRUCT_EVALS=[
+    "meta-llama/Llama-3.2-1B-Instruct-evals",
+    "meta-llama/Llama-3.2-3B-Instruct-evals",
+]
+LLAMA_3_2_PRETRAIN_EVALS=[
+    "meta-llama/Llama-3.2-1B-evals",
+    "meta-llama/Llama-3.2-3B-evals",
+]
 
 # get the ifeval  from the evals dataset and join it with the original ifeval datasets
 def get_ifeval_data(model_name, output_dir):
@@ -56,8 +74,8 @@ def get_ifeval_data(model_name, output_dir):
 
 
 # get the math_hard data from the evals dataset and join it with the original math_hard dataset
-def get_math_data(model_name, output_dir):
-    print(f"preparing the math data using {model_name}'s evals dataset")
+def get_math_hard_data(model_name, output_dir):
+    print(f"preparing the math hard data using {model_name}'s evals dataset")
     if model_name not in [
         "Llama-3.1-8B-Instruct",
         "Llama-3.1-70B-Instruct",
@@ -74,6 +92,30 @@ def get_math_data(model_name, output_dir):
         split="latest",
     )
     math_data = load_dataset(original_dataset_name, split="test")
+    joined = join_meta_and_original_math_data(meta_data, math_data)
+    joined.to_parquet(output_dir + "/joined_math_hard.parquet")
+
+def get_math_data(model_name, output_dir):
+    print(f"preparing the math data using {model_name}'s evals dataset")
+    if model_name not in [
+        "Llama-3.2-1B-Instruct",
+        "Llama-3.2-3B-Instruct",
+    ]:
+        raise ValueError(
+            "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
+        )
+    original_dataset_name = "lighteval/MATH"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__math__details",
+        split="latest",
+    )
+    math_data = load_dataset(original_dataset_name, split="test")
+    joined = join_meta_and_original_math_data(meta_data, math_data)
+    joined.to_parquet(output_dir + "/joined_math.parquet")
+
+def join_meta_and_original_math_data(meta_data, math_data):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
     math_df = math_df.rename(columns={"problem": "input_question"})
@@ -94,9 +136,7 @@ def get_math_data(model_name, output_dir):
     joined = joined.rename_column(
         "output_prediction_text", "previous_output_prediction_text"
     )
-
-    joined.to_parquet(output_dir + "/joined_math.parquet")
-
+    return joined
 
 # get the question from the ifeval dataset
 def get_question(example):
@@ -134,39 +174,33 @@ def change_yaml(args, base_name):
                         "WORK_DIR", str(yaml_dir)
                     )
                 )
-    # 3.2 evals dataset has a differents set of evals from 3.1
-    # so update tasks in meta_pretrain.yaml (3.2 for meta_instruct.yaml not supported yet)
+    # 3.2 evals dataset has a different set of tasks from 3.1
+    # Update tasks in meta_pretrain.yaml
     with open(args.template_dir + "/meta_pretrain.yaml", "r") as yaml_file:
         meta_pretrain = yaml.safe_load(yaml_file)
-
-    if args.evals_dataset in [
-        "meta-llama/Llama-3.2-1B-evals",
-        "meta-llama/Llama-3.2-3B-evals",
-    ]:
-        meta_pretrain["task"] = ["meta_mmlu"]
-    elif args.evals_dataset in [
-        "meta-llama/Llama-3.1-8B-evals",
-        "meta-llama/Llama-3.1-70B-evals",
-        "meta-llama/Llama-3.1-405B-evals",
-    ]:
+    if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
         meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
-
+    elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
+        meta_pretrain["task"] = ["meta_mmlu"]
     with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
         yaml.dump(meta_pretrain, yaml_file)
-
-
+
+    # Update tasks in meta_instruct.yaml
+    with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
+        meta_instruct = yaml.safe_load(yaml_file)
+    if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
+        meta_instruct["task"] = ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
+    elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
+        meta_instruct["task"] = ["meta_mmlu", "meta_math", "meta_gpqa"]
+    with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
+        yaml.dump(meta_instruct, yaml_file)
 
 
 # copy the files and change the yaml file to use the correct model name
 def copy_and_prepare(args):
     # nltk punkt_tab package is needed
     nltk.download('punkt_tab')
-    if not os.path.exists(args.work_dir):
-        # Copy the all files, including yaml files and python files, from template folder to the work folder
-
-        copy_dir(args.template_dir, args.work_dir)
-    else:
-        print("work_dir already exists, no need to copy files")
+    copy_dir(args.template_dir, args.work_dir)
     # Use the template yaml to get the correct model name in work_dir yaml
     base_name = (
         args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
@@ -190,21 +224,22 @@ def prepare_datasets(args):
     # model_name are derived from the evals_dataset name
     task_list = args.tasks.split(",")
     model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
-    if "meta_instruct" in task_list:
+    if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
         get_ifeval_data(model_name, args.work_dir)
-
+        get_math_hard_data(model_name, args.work_dir)
+    elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
         get_math_data(model_name, args.work_dir)
     else:
         if "meta_ifeval" in task_list:
             get_ifeval_data(model_name, args.work_dir)
         if "meta_math_hard" in task_list:
-            get_math_data(model_name, args.work_dir)
+            get_math_hard_data(model_name, args.work_dir)
 
 
 # copy the files from src to dst
 def copy_dir(src, dst):
     try:
-        shutil.copytree(src, dst)
+        shutil.copytree(src, dst, dirs_exist_ok=True)
     except OSError as exc:  # python >2.5
         if exc.errno in (errno.ENOTDIR, errno.EINVAL):
             shutil.copy(src, dst)
@@ -228,18 +263,14 @@ if __name__ == "__main__":
         args.__setattr__(k, v)
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
-    if args.evals_dataset not in [
-        "meta-llama/Llama-3.1-8B-Instruct-evals",
-        "meta-llama/Llama-3.1-70B-Instruct-evals",
-        "meta-llama/Llama-3.1-405B-Instruct-evals",
-        "meta-llama/Llama-3.1-8B-evals",
-        "meta-llama/Llama-3.1-70B-evals",
-        "meta-llama/Llama-3.1-405B-evals",
-        "meta-llama/Llama-3.2-1B-evals",
-        "meta-llama/Llama-3.2-3B-evals",
-    ]:
+    if args.evals_dataset not in (
+        LLAMA_3_1_INSTRUCT_EVALS +
+        LLAMA_3_1_PRETRAIN_EVALS +
+        LLAMA_3_2_INSTRUCT_EVALS +
+        LLAMA_3_2_PRETRAIN_EVALS
+    ):
         raise ValueError(
-            "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection. Note that 3.2-Instruct evals are not yet supported."
+            "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 or 3.2 Evals collection."
         )
     args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
     # Copy the all files from template folder to the work folder
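To summarize the routing this change introduces: `meta_instruct` and `meta_pretrain` now expand to different task lists depending on which evals dataset is configured. The hypothetical helper below (not part of the script) mirrors the `change_yaml` logic above; it assumes it is run next to `prepare_meta_eval.py` so the `LLAMA_3_*` constants can be imported.

```python
# Hypothetical summary of the task routing in change_yaml.
from prepare_meta_eval import (
    LLAMA_3_1_INSTRUCT_EVALS,
    LLAMA_3_1_PRETRAIN_EVALS,
    LLAMA_3_2_INSTRUCT_EVALS,
    LLAMA_3_2_PRETRAIN_EVALS,
)

def expand_task_group(evals_dataset: str, group: str) -> list:
    if group == "meta_instruct":
        if evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
            return ["meta_ifeval", "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct"]
        if evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
            return ["meta_mmlu", "meta_math", "meta_gpqa"]
    if group == "meta_pretrain":
        if evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
            return ["meta_bbh", "meta_mmlu_pro_pretrain"]
        if evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
            return ["meta_mmlu"]
    raise ValueError(f"Unsupported combination: {evals_dataset} / {group}")

print(expand_task_group("meta-llama/Llama-3.2-3B-Instruct-evals", "meta_instruct"))
# -> ['meta_mmlu', 'meta_math', 'meta_gpqa']
```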