
first commit, local not working

Kai Wu · 9 months ago · commit 9f0acebe02

+ 1 - 0
tools/benchmarks/meta_eval_reproduce/README.md

@@ -0,0 +1 @@
+python meta_eval.py
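For context, meta_eval.py reads ./eval_config.yaml by default; a different config file can be passed through the --config_path flag defined in parse_eval_args below, e.g.:

    python meta_eval.py --config_path ./eval_config.yaml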

+ 32 - 0
tools/benchmarks/meta_eval_reproduce/eval_config.yaml

@@ -0,0 +1,32 @@
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name on the Hugging Face model hub.
+
+evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the Llama 3.1 evals dataset to use; make sure it corresponds to the model being evaluated. This must be a valid dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+
+tasks: "meta_math_hard" # Available tasks for the instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or use "meta_instruct" to run all of them.
+# Available tasks for the pretrained model: "meta_bbh", "meta_mmlu_pro_pretrain"; or use "meta_pretrain" to run all of them.
+
+tensor_parallel_size: 2 # The vLLM argument that specifies the tensor parallel size, i.e. how many GPUs are used for each model copy.
+
+data_parallel_size: 4 # The vLLM argument that specifies the data parallel size, i.e. how many model copies are run in parallel.
+
+gpu_memory_utilization: 0.9 # The vLLM argument that specifies the fraction of GPU memory reserved for the model weights, activations and KV cache.
+
+max_model_len: 8192 # The vLLM argument that specifies the maximum model context length; decrease it only if you run into GPU memory issues. Make sure max_gen_toks in the task yamls does not exceed this length.
+
+batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' with vLLM to speed up inference.
+
+output_path: "eval_results" # the output folder to store all the eval results and samples.
+
+limit: 16 # Limit number of examples per task, set 'null' to run all.
+#limit: null # Limit number of examples per task.
+
+verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
+
+log_samples: true # If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.
+
+work_dir: ./work_dir # The work folder where the task template yamls are copied and modified, and where the prepared math_hard and ifeval datasets are stored.
+
+template_dir: ./meta_template # Path to the folder that contains all the meta task templates.
+
+show_config: false # If True, shows the full config of all tasks at the end of the evaluation.
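For orientation, a minimal sketch (assuming PyYAML is installed and this file sits in the current directory) of how meta_eval.py below turns these values into the vLLM model_args string it passes to lm_eval.simple_evaluate:

import yaml

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Mirrors the f-string assembled at the bottom of meta_eval.py.
model_args = (
    f"pretrained={cfg['model_name']},"
    f"tensor_parallel_size={cfg['tensor_parallel_size']},dtype=auto,"
    f"gpu_memory_utilization={cfg['gpu_memory_utilization']},"
    f"data_parallel_size={cfg['data_parallel_size']},"
    f"max_model_len={cfg['max_model_len']},add_bos_token=True,seed=42"
)
print(model_args)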

+ 194 - 0
tools/benchmarks/meta_eval_reproduce/meta_eval.py

@@ -0,0 +1,194 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import argparse
+import errno
+import glob
+import json
+import logging
+import os
+import re
+import shutil
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import lm_eval
+import numpy as np
+import yaml
+from lm_eval import tasks
+from lm_eval.utils import make_table
+
+from prepare_dataset import get_ifeval_data, get_math_data
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
+def setup_logging(verbosity):
+    logging.basicConfig(
+        level=verbosity.upper(), format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+    return logging.getLogger(__name__)
+
+def change_yaml(args, base_name):
+    # Copy each task template yaml into work_dir, substituting the base model name
+    # and replacing the WORK_DIR placeholder with the directory of the copied yaml.
+    for yaml_file in glob.glob(os.path.join(args.template_dir, "**", "*.yaml"), recursive=True):
+        with open(yaml_file, "r") as sources:
+            lines = sources.readlines()
+        output_path = yaml_file.replace(args.template_dir, args.work_dir)
+        print(f"Rewriting {yaml_file} to {output_path}")
+        yaml_dir = Path(output_path).parent
+        with open(output_path, "w") as output:
+            for line in lines:
+                output.write(line.replace("Meta-Llama-3.1-8B", base_name).replace("WORK_DIR", str(yaml_dir)))
+
+def handle_output(args, results, logger):
+    if not results:
+        logger.error("No results found.")
+        sys.exit(1)
+    if not args.output_path:
+        if args.log_samples:
+            logger.error("Specify --output_path for logging samples.")
+            sys.exit(1)
+        return
+
+    if args.log_samples:
+        samples = results.pop("samples")
+    results_str = json.dumps(
+        results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+    )
+    if args.show_config:
+        logger.info(results_str)
+    date_id = datetime.now().isoformat().replace(":", "-")
+    path = Path(args.output_path)
+
+
+    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+
+    file_path = output_dir.joinpath(f"eval_results_{date_id}.json")
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(results_str)
+
+    if args.log_samples:
+        for task_name, _ in results.get("configs", {}).items():
+            output_name = task_name + "_"+ date_id + re.sub(r"/|=", "_", args.model_args.split(",")[0].replace("pretrained",""))
+            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
+            sample_data = json.dumps(
+                samples.get(task_name, {}), indent=2, default=_handle_non_serializable
+            )
+            sample_file.write_text(sample_data, encoding="utf-8")
+
+    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
+    summary = f"{args.model_name} ({args.model_args})"
+    logger.info(summary)
+    logger.info(make_table(results))
+    if "groups" in results:
+        logger.info(make_table(results, "groups"))
+
+
+def load_tasks(args):
+    if not args.tasks or "meta" not in args.tasks:
+        raise ValueError("Please specify a valid meta task name")
+    tasks_list = args.tasks.split(",")
+    current_dir = os.getcwd()
+    config_dir = os.path.join(current_dir, args.work_dir)
+    print(f"Including the config_dir to task manager: {config_dir}")
+    task_manager = tasks.TaskManager(include_path=config_dir)
+    return task_manager, tasks_list
+
+def copy_and_prepare(args):
+    if not os.path.exists(args.work_dir):
+        # Copy all files, including yaml files and python files, from the template folder to the work folder
+        copy_dir(args.template_dir, args.work_dir)
+    else:
+        print("work_dir already exists, no need to copy files")
+    # Derive the base model name from the evals dataset and patch it into the work_dir yamls
+    base_name = args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
+    change_yaml(args, base_name)
+
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        default="./eval_config.yaml",
+        help="the config yaml file that contains all the eval parameters",
+    )
+    return parser.parse_args()
+
+def prepare_datasets(task_list, args):
+    # Prepare the datasets for the IFeval and MATH_Hard tasks: the original datasets must be
+    # joined with the evals dataset on the actual question text.
+    # The model_name is derived from the evals_dataset name.
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
+    if "meta_instruct" in task_list:
+        get_ifeval_data(model_name, args.work_dir)
+        get_math_data(model_name, args.work_dir)
+    else:
+        if "meta_ifeval" in task_list:
+            get_ifeval_data(model_name, args.work_dir)
+        if "meta_math_hard" in task_list:
+            get_math_data(model_name, args.work_dir)
+
+def evaluate_model(args):
+    # For a customized model (e.g. a quantized model), see the lm-evaluation-harness guide on
+    # external library usage:
+    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+    task_manager, task_list = load_tasks(args)
+    logger.info(f"Loaded tasks: {task_list}")
+    # The IFeval and MATH_Hard tasks need their datasets prepared before evaluation
+    if any(t in task_list for t in ("meta_instruct", "meta_ifeval", "meta_math_hard")):
+        prepare_datasets(task_list, args)
+    # Evaluate
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=args.model_args,
+        tasks=task_list,
+        batch_size=args.batch_size,
+        limit=args.limit,
+        log_samples=args.log_samples,
+        task_manager=task_manager,
+        random_seed=42,
+        numpy_random_seed=42,
+        torch_random_seed=42,
+        fewshot_random_seed=42,
+    )
+    handle_output(args, results, logger)
+
+
+def copy_dir(src, dst):
+    try:
+        shutil.copytree(src, dst)
+    except OSError as exc: # python >2.5
+        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
+            shutil.copy(src, dst)
+        else: raise
+def load_config(config_path: str = "./eval_config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+
+if __name__ == "__main__":
+    args = parse_eval_args()
+    config = load_config(args.config_path)
+    # Merge the yaml config into the parsed args namespace
+    for k, v in config.items():
+        setattr(args, k, v)
+    if not os.path.exists(args.template_dir):
+        raise ValueError("The template_dir does not exist, please check the path")
+    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
+        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
+    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
+    # Copy the all files from template folder to the work folder
+    copy_and_prepare(args)
+    logger = setup_logging(args.verbosity)
+    evaluate_model(args)
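For reference, a small sketch of the name handling that copy_and_prepare and change_yaml rely on: the evals dataset name is reduced to a base model name, which then replaces the Meta-Llama-3.1-8B placeholder (and WORK_DIR) in the copied template yamls. The 70B dataset name here is only an example:

evals_dataset = "meta-llama/Meta-Llama-3.1-70B-Instruct-evals"
base_name = evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
print(base_name)  # Meta-Llama-3.1-70B

template_line = "dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals"
print(template_line.replace("Meta-Llama-3.1-8B", base_name))
# dataset_path: meta-llama/Meta-Llama-3.1-70B-Instruct-evals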

+ 28 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
+dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+task: meta_bbh
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'the answer is (.*?)\.'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
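To make the strict-match filter above concrete, a tiny sketch (the model output string is made up) of what the regex extracts before take_first and exact_match are applied:

import re

output = "Let's reason step by step. The relevant fact is X. So the answer is (B)."
match = re.search(r"the answer is (.*?)\.", output)
print(match.group(1))  # (B)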

+ 21 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/bbh/utils.py

@@ -0,0 +1,21 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "answer": doc["input_correct_responses"][0],
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 29 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
+task: meta_gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 21 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/utils.py

@@ -0,0 +1,21 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+            "choices": list(doc["input_choice_list"]),
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "input_choice_list", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 32 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml

@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: parquet
+dataset_kwargs:
+  # WORK_DIR is replaced by meta_eval.py with the directory of this copied yaml;
+  # prepare_dataset.py writes joined_ifeval.parquet one level up, in work_dir itself.
+  data_files: WORK_DIR/../joined_ifeval.parquet
+output_type: generate_until
+test_split: train
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+  max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_strict_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_loose_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+metadata:
+  version: 2.0
+fewshot_config:
+  sampler: first_n

+ 139 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/utils.py

@@ -0,0 +1,139 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+                
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    new_kwargs = []
+    for item in doc["kwargs"]:
+        if item["nth_paragraph"]:
+            item["nth_paragraph"] = int(item["nth_paragraph"])
+        new_kwargs.append(item)
+    inp = InputExample(
+        key=doc["key"],
+        instruction_id_list=doc["instruction_id_list"],
+        prompt=doc["prompt"],
+        kwargs=new_kwargs,
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        "prompt_level_strict_acc": out_strict.follow_all_instructions,
+        "inst_level_strict_acc": out_strict.follow_instruction_list,
+        "prompt_level_loose_acc": out_loose.follow_all_instructions,
+        "inst_level_loose_acc": out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
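As a quick worked example of the instruction-level aggregation above (the boolean lists are illustrative): each sample contributes one boolean per instruction, and the lists are flattened before averaging:

items = [[True, False], [True], [True, True, False]]
flat = [x for sub in items for x in sub]
print(sum(flat) / len(flat))  # 0.666... (4 of 6 instructions followed)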

+ 46 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard.py

@@ -0,0 +1,46 @@
+from datasets import load_dataset
+import os
+import yaml
+
+
+def load_config(config_path: str = "./eval_config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        return yaml.safe_load(file)
+
+
+# Quick local check that the joined MATH-Hard parquet produced by prepare_dataset.py
+# can be loaded from the configured work_dir.
+current_dir = os.getcwd()
+print("current_dir", current_dir)
+config = load_config(os.path.join(current_dir, "eval_config.yaml"))
+work_dir = config["work_dir"]
+load_dataset("parquet", data_files=os.path.join(current_dir, work_dir, "joined_math.parquet"))

+ 26 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml

@@ -0,0 +1,26 @@
+dataset_path: parquet
+dataset_name: null
+dataset_kwargs:
+  data_files:
+    # WORK_DIR is replaced by meta_eval.py with the directory of this copied yaml;
+    # prepare_dataset.py writes joined_math.parquet one level up, in work_dir itself.
+    train: WORK_DIR/../joined_math.parquet
+  trust_remote_code: true
+task: meta_math_hard
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 5120
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 303 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py

@@ -0,0 +1,303 @@
+# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py
+import re
+import signal
+from typing import Dict, List, Optional
+
+import datasets
+
+from lm_eval.utils import eval_logger
+
+
+try:
+    import sympy
+    from sympy.parsing.latex import parse_latex
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "`sympy` is required for the MATH-Hard answer equivalence check. "
+        "Please install it via pip install lm-eval[math] or pip install -e .[math]",
+    )
+
+# taken from
+# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": normalize_final_answer(
+                 remove_boxed(last_boxed_only_string(doc["solution"]))
+            ),
+            "meta_target": doc["input_correct_responses"]
+        }
+        return out_doc
+    #dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    return dataset.map(_process_doc)
+
+
+
+def extract_result_from_boxed(answer: str) -> str:
+    box_start = "\\boxed"
+    # format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
+    start = answer.rfind(box_start)
+    if start < 0:
+        return ""
+    answer = answer[start + len(box_start) :].strip()
+    ends_with_curly = answer.startswith("{")
+    i = 0
+    open_braces = 0
+    while i < len(answer):
+        if answer[i] == "{":
+            open_braces += 1
+        elif answer[i] == "}":
+            open_braces -= 1
+        if open_braces == 0:
+            if ends_with_curly:
+                answer = answer[: i + 1].strip()
+                break
+            elif answer[i] == "$":
+                answer = answer[:i].strip()
+                break
+        i += 1
+    else:
+        return ""
+    # remove extra curly braces
+    while True:
+        if answer.startswith("{") and answer.endswith("}"):
+            answer = answer[1:-1].strip()
+        else:
+            break
+    return answer
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidates = results[0]
+
+    unnormalized_answer = get_unnormalized_answer(candidates)
+    if unnormalized_answer == "[invalidanswer]":
+        unnormalized_answer = extract_result_from_boxed(candidates)
+    answer = normalize_final_answer(unnormalized_answer)
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
+        retval = 1
+    else:
+        retval = 0
+
+    results = {
+        "exact_match": retval,
+    }
+    return results
+
+
+def last_boxed_only_string(string: str) -> Optional[str]:
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx : right_brace_idx + 1]
+
+    return retval
+
+
+def remove_boxed(s: str) -> str:
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[: len(left)] == left
+        return s[len(left) :]
+
+    left = "\\boxed{"
+
+    assert s[: len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left) : -1]
+
+
+class timeout:
+    def __init__(self, seconds=1, error_message="Timeout"):
+        self.seconds = seconds
+        self.error_message = error_message
+
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
+
+
+def is_equiv(x1: str, x2: str) -> bool:
+    """
+    x1 and x2 are normalized latex string
+    """
+    try:
+        with timeout(seconds=5):
+            try:
+                parsed_x1 = parse_latex(x1)
+                parsed_x2 = parse_latex(x2)
+            except (
+                sympy.parsing.latex.errors.LaTeXParsingError,
+                sympy.SympifyError,
+                TypeError,
+            ):
+                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
+                return False
+
+            try:
+                diff = parsed_x1 - parsed_x2
+            except TypeError:
+                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
+                return False
+
+            try:
+                if sympy.simplify(diff) == 0:
+                    return True
+                else:
+                    return False
+            except ValueError:
+                eval_logger.debug(
+                    f"Had some trouble simplifying when comparing {x1} and {x2}"
+                )
+    except TimeoutError:
+        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
+        return False
+    except ImportError as e:
+        eval_logger.error(e)
+        raise
+    except Exception as e:
+        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
+        return False
+
+
+def get_unnormalized_answer(text: str) -> str:
+    INVALID_ANSWER = "[invalidanswer]"
+    end_seq = "I hope it is correct."
+    text += end_seq
+    match = re.search(
+        r"Final Answer: The final answer is(.*?). I hope it is correct.",
+        text,
+    )
+    if match:
+        return match.group(1).strip()
+    else:
+        return INVALID_ANSWER
+
+
+SUBSTITUTIONS = [
+    ("an ", ""),
+    ("a ", ""),
+    (".$", "$"),
+    ("\\$", ""),
+    (r"\ ", ""),
+    (" ", ""),
+    ("mbox", "text"),
+    (",\\text{and}", ","),
+    ("\\text{and}", ","),
+    ("\\text{m}", "\\text{}"),
+]
+REMOVED_EXPRESSIONS = [
+    "square",
+    "ways",
+    "integers",
+    "dollars",
+    "mph",
+    "inches",
+    "ft",
+    "hours",
+    "km",
+    "units",
+    "\\ldots",
+    "sue",
+    "points",
+    "feet",
+    "minutes",
+    "digits",
+    "cents",
+    "degrees",
+    "cm",
+    "gm",
+    "pounds",
+    "meters",
+    "meals",
+    "edges",
+    "students",
+    "childrentickets",
+    "multiples",
+    "\\text{s}",
+    "\\text{.}",
+    "\\text{\ns}",
+    "\\text{}^2",
+    "\\text{}^3",
+    "\\text{\n}",
+    "\\text{}",
+    r"\mathrm{th}",
+    r"^\circ",
+    r"^{\circ}",
+    r"\;",
+    r",\!",
+    "{,}",
+    '"',
+    "\\dots",
+]
+
+
+def normalize_final_answer(final_answer: str) -> str:
+    """
+    Normalize a final answer to a quantitative reasoning question.
+
+    Copied character for character from appendix D of Lewkowycz et al. (2022)
+    """
+    final_answer = final_answer.split("=")[-1]
+
+    for before, after in SUBSTITUTIONS:
+        final_answer = final_answer.replace(before, after)
+    for expr in REMOVED_EXPRESSIONS:
+        final_answer = final_answer.replace(expr, "")
+
+    # Extract answer that is in LaTeX math, is bold,
+    # is surrounded by a box, etc.
+    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+
+    # Normalize shorthand TeX:
+    #  \fracab -> \frac{a}{b}
+    #  \frac{abc}{bef} -> \frac{abc}{bef}
+    #  \fracabc -> \frac{a}{b}c
+    #  \sqrta -> \sqrt{a}
+    #  \sqrtab -> sqrt{a}b
+    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+    final_answer = final_answer.replace("$", "")
+
+    # Normalize 100,000 -> 100000
+    if final_answer.replace(",", "").isdigit():
+        final_answer = final_answer.replace(",", "")
+
+    return final_answer
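A small worked example of the answer pipeline above, assuming this utils.py is importable as utils (the model output string is made up): get_unnormalized_answer pulls the text after "Final Answer: The final answer is", and normalize_final_answer strips the surrounding $ and \boxed{}:

from utils import get_unnormalized_answer, normalize_final_answer

raw = "Final Answer: The final answer is $\\boxed{\\frac{1}{2}}$. I hope it is correct."
ans = get_unnormalized_answer(raw)
print(ans)                          # $\boxed{\frac{1}{2}}$
print(normalize_final_answer(ans))  # \frac{1}{2}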

+ 6 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/meta_instruct.yaml

@@ -0,0 +1,6 @@
+group: meta_instruct
+task:
+  - meta_ifeval
+  - meta_math_hard
+  - meta_gpqa
+  - meta_mmlu_pro_instruct

+ 4 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/meta_pretrain.yaml

@@ -0,0 +1,4 @@
+group: meta_pretrain
+task:
+  - meta_bbh
+  - meta_mmlu_pro_pretrain

+ 28 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+output_type: generate_until
+task: meta_mmlu_pro_instruct
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 27 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -0,0 +1,27 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
+dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+output_type: generate_until
+task: meta_mmlu_pro_pretrain
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'answer is \(([A-Z])\)'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 22 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/utils.py

@@ -0,0 +1,22 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+            "choices": list(doc["input_choice_list"]),
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "input_choice_list", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 85 - 0
tools/benchmarks/meta_eval_reproduce/prepare_dataset.py

@@ -0,0 +1,85 @@
+from datasets import Dataset, load_dataset
+
+
+def get_ifeval_data(model_name, output_dir):
+    # Join the Meta IFeval evals dataset with the original instruction-following-eval
+    # dataset on the question text, so both the prompts and the grading kwargs are available.
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct", "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct and Meta-Llama-3.1-405B-Instruct are supported for IFEval")
+    original_dataset_name = "wis-k/instruction-following-eval"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__ifeval__strict__details",
+        split="latest",
+    )
+    ifeval_data = load_dataset(original_dataset_name, split="train")
+    meta_data = meta_data.map(get_question)
+    meta_df = meta_data.to_pandas()
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+    print("meta_df columns:", meta_df.columns)
+    print("ifeval_df columns:", ifeval_df.columns)
+
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct", "instruction_id_list", "kwargs", "output_prediction_text", "key"])
+    joined = joined.rename_column("output_prediction_text", "previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + "/joined_ifeval.parquet")
+
+
+def get_math_data(model_name, output_dir):
+    # Join the Meta MATH-Hard evals dataset with the original MATH-Hard dataset on the
+    # question text, so the reference solutions are available for answer checking.
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct", "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct and Meta-Llama-3.1-405B-Instruct are supported for MATH_hard")
+    original_dataset_name = "lighteval/MATH-Hard"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__math_hard__details",
+        split="latest",
+    )
+    math_data = load_dataset(original_dataset_name, split="test")
+    meta_df = meta_data.to_pandas()
+    math_df = math_data.to_pandas()
+    math_df = math_df.rename(columns={"problem": "input_question"})
+    print("meta_df columns:", meta_df.columns)
+    print("math_df columns:", math_df.columns)
+
+    joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
+    # The joined frame keeps the original "is_correct" and "solution" columns,
+    # which math_hard/utils.py expects when it processes the parquet file.
+    joined.to_parquet(output_dir + "/joined_math.parquet")
+
+
+def get_question(example):
+    # input_question in the evals dataset is a serialized dialog; extract the user turn and
+    # normalize a couple of capitalization differences so the join on question text succeeds.
+    try:
+        example["input_question"] = eval(
+            example["input_question"].replace("null", "None").replace("true", "True").replace("false", "False")
+        )["dialog"][0]["body"].replace("Is it True that the first song", "Is it true that the first song").replace("Is the following True", "Is the following true")
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+    except Exception:
+        print(example["input_question"])
+        raise
+
+
+def check_sample(example):
+    if "kwargs" in example and not example["kwargs"]:
+        print(example)
+        raise ValueError("This example was not joined correctly for IFEval")
+    if "solution" in example and not example["solution"]:
+        print(example)
+        raise ValueError("This example was not joined correctly for MATH_hard")