
first commit, local not working

Kai Wu · 9 months ago · commit 9f0acebe02

+ 1 - 0
tools/benchmarks/meta_eval_reproduce/README.md

@@ -0,0 +1 @@
+python meta_eval.py
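For context, meta_eval.py reads ./eval_config.yaml by default; a different config file can be passed through the --config_path flag defined in parse_eval_args below, e.g.:

    python meta_eval.py --config_path ./eval_config.yaml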

+ 32 - 0
tools/benchmarks/meta_eval_reproduce/eval_config.yaml

@@ -0,0 +1,32 @@
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name on the Hugging Face model hub.
+
+evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the Llama 3.1 evals dataset to use; make sure it corresponds to the model being evaluated. This must be a valid dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+
+tasks: "meta_math_hard" # Available tasks for the instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or use "meta_instruct" to run all of them.
+# Available tasks for the pretrained model: "meta_bbh", "meta_mmlu_pro_pretrain"; or use "meta_pretrain" to run all of them.
+
+tensor_parallel_size: 2 # The vLLM argument that specifies the tensor parallel size, i.e. how many GPUs are used for each model copy.
+
+data_parallel_size: 4 # The vLLM argument that specifies the data parallel size, i.e. how many model copies are run in parallel.
+
+gpu_memory_utilization: 0.9 # The vLLM argument that specifies the fraction of GPU memory reserved for the model weights, activations and KV cache.
+
+max_model_len: 8192 # The vLLM argument that specifies the maximum model context length; decrease it only if you run into GPU memory issues. Make sure max_gen_toks in the task yamls does not exceed this length.
+
+batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' with vLLM to speed up inference.
+
+output_path: "eval_results" # the output folder to store all the eval results and samples.
+
+limit: 16 # Limit number of examples per task, set 'null' to run all.
+#limit: null # Limit number of examples per task.
+
+verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
+
+log_samples: true # If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.
+
+work_dir: ./work_dir # The work folder where the task template yamls are copied and modified, and where the prepared math_hard and ifeval datasets are stored.
+
+template_dir: ./meta_template # Path to the folder that contains all the meta task templates.
+
+show_config: false # If True, shows the full config of all tasks at the end of the evaluation.
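For orientation, a minimal sketch (assuming PyYAML is installed and this file sits in the current directory) of how meta_eval.py below turns these values into the vLLM model_args string it passes to lm_eval.simple_evaluate:

import yaml

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Mirrors the f-string assembled at the bottom of meta_eval.py.
model_args = (
    f"pretrained={cfg['model_name']},"
    f"tensor_parallel_size={cfg['tensor_parallel_size']},dtype=auto,"
    f"gpu_memory_utilization={cfg['gpu_memory_utilization']},"
    f"data_parallel_size={cfg['data_parallel_size']},"
    f"max_model_len={cfg['max_model_len']},add_bos_token=True,seed=42"
)
print(model_args)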

+ 194 - 0
tools/benchmarks/meta_eval_reproduce/meta_eval.py

@@ -0,0 +1,194 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import argparse
+import errno
+import glob
+import json
+import logging
+import os
+import re
+import shutil
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import lm_eval
+import numpy as np
+import yaml
+from lm_eval import tasks
+from lm_eval.utils import make_table
+
+from prepare_dataset import get_ifeval_data, get_math_data
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
+def setup_logging(verbosity):
+    logging.basicConfig(
+        level=verbosity.upper(), format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+    return logging.getLogger(__name__)
+
+def change_yaml(args, base_name):
+    # Copy each task template yaml into work_dir, substituting the base model name
+    # and replacing the WORK_DIR placeholder with the directory of the copied yaml.
+    for yaml_file in glob.glob(os.path.join(args.template_dir, "**", "*.yaml"), recursive=True):
+        with open(yaml_file, "r") as sources:
+            lines = sources.readlines()
+        output_path = yaml_file.replace(args.template_dir, args.work_dir)
+        print(f"Rewriting {yaml_file} to {output_path}")
+        yaml_dir = Path(output_path).parent
+        with open(output_path, "w") as output:
+            for line in lines:
+                output.write(line.replace("Meta-Llama-3.1-8B", base_name).replace("WORK_DIR", str(yaml_dir)))
+
+def handle_output(args, results, logger):
+    if not results:
+        logger.error("No results found.")
+        sys.exit(1)
+    if not args.output_path:
+        if args.log_samples:
+            logger.error("Specify --output_path for logging samples.")
+            sys.exit(1)
+        return
+
+    if args.log_samples:
+        samples = results.pop("samples")
+    results_str = json.dumps(
+        results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+    )
+    if args.show_config:
+        logger.info(results_str)
+    date_id = datetime.now().isoformat().replace(":", "-")
+    path = Path(args.output_path)
+
+
+    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+
+    file_path = output_dir.joinpath(f"eval_results_{date_id}.json")
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(results_str)
+
+    if args.log_samples:
+        for task_name, _ in results.get("configs", {}).items():
+            output_name = task_name + "_"+ date_id + re.sub(r"/|=", "_", args.model_args.split(",")[0].replace("pretrained",""))
+            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
+            sample_data = json.dumps(
+                samples.get(task_name, {}), indent=2, default=_handle_non_serializable
+            )
+            sample_file.write_text(sample_data, encoding="utf-8")
+
+    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
+    summary = f"{args.model_name} ({args.model_args})"
+    logger.info(summary)
+    logger.info(make_table(results))
+    if "groups" in results:
+        logger.info(make_table(results, "groups"))
+
+
+def load_tasks(args):
+    if not args.tasks or "meta" not in args.tasks:
+        raise ValueError("Please specify a valid meta task name")
+    tasks_list = args.tasks.split(",")
+    current_dir = os.getcwd()
+    config_dir = os.path.join(current_dir, args.work_dir)
+    print(f"Including the config_dir to task manager: {config_dir}")
+    task_manager = tasks.TaskManager(include_path=config_dir)
+    return task_manager, tasks_list
+
+def copy_and_prepare(args):
+    if not os.path.exists(args.work_dir):
+        # Copy all files, including yaml files and python files, from the template folder to the work folder
+        copy_dir(args.template_dir, args.work_dir)
+    else:
+        print("work_dir already exists, no need to copy files")
+    # Derive the base model name from the evals dataset and patch it into the work_dir yamls
+    base_name = args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
+    change_yaml(args, base_name)
+
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        default="./eval_config.yaml",
+        help="the config yaml file that contains all the eval parameters",
+    )
+    return parser.parse_args()
+
+def prepare_datasets(task_list, args):
+    # Prepare the datasets for the IFeval and MATH_Hard tasks: the original datasets must be
+    # joined with the evals dataset on the actual question text.
+    # The model_name is derived from the evals_dataset name.
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
+    if "meta_instruct" in task_list:
+        get_ifeval_data(model_name, args.work_dir)
+        get_math_data(model_name, args.work_dir)
+    else:
+        if "meta_ifeval" in task_list:
+            get_ifeval_data(model_name, args.work_dir)
+        if "meta_math_hard" in task_list:
+            get_math_data(model_name, args.work_dir)
+
+def evaluate_model(args):
+    # For a customized model (e.g. a quantized model), see the lm-evaluation-harness guide on
+    # external library usage:
+    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
+    task_manager, task_list = load_tasks(args)
+    logger.info(f"Loaded tasks: {task_list}")
+    # The IFeval and MATH_Hard tasks need their datasets prepared before evaluation
+    if any(t in task_list for t in ("meta_instruct", "meta_ifeval", "meta_math_hard")):
+        prepare_datasets(task_list, args)
+    # Evaluate
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=args.model_args,
+        tasks=task_list,
+        batch_size=args.batch_size,
+        limit=args.limit,
+        log_samples=args.log_samples,
+        task_manager=task_manager,
+        random_seed=42,
+        numpy_random_seed=42,
+        torch_random_seed=42,
+        fewshot_random_seed=42,
+    )
+    handle_output(args, results, logger)
+
+
+def copy_dir(src, dst):
+    try:
+        shutil.copytree(src, dst)
+    except OSError as exc: # python >2.5
+        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
+            shutil.copy(src, dst)
+        else: raise
+def load_config(config_path: str = "./eval_config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+
+if __name__ == "__main__":
+    args = parse_eval_args()
+    config = load_config(args.config_path)
+    # Merge the yaml config into the parsed args namespace
+    for k, v in config.items():
+        setattr(args, k, v)
+    if not os.path.exists(args.template_dir):
+        raise ValueError("The template_dir does not exist, please check the path")
+    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
+        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
+    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
+    # Copy the all files from template folder to the work folder
+    copy_and_prepare(args)
+    logger = setup_logging(args.verbosity)
+    evaluate_model(args)
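For reference, a small sketch of the name handling that copy_and_prepare and change_yaml rely on: the evals dataset name is reduced to a base model name, which then replaces the Meta-Llama-3.1-8B placeholder (and WORK_DIR) in the copied template yamls. The 70B dataset name here is only an example:

evals_dataset = "meta-llama/Meta-Llama-3.1-70B-Instruct-evals"
base_name = evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
print(base_name)  # Meta-Llama-3.1-70B

template_line = "dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals"
print(template_line.replace("Meta-Llama-3.1-8B", base_name))
# dataset_path: meta-llama/Meta-Llama-3.1-70B-Instruct-evals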

+ 28 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
+dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+task: meta_bbh
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'the answer is (.*?)\.'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
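To make the strict-match filter above concrete, a tiny sketch (the model output string is made up) of what the regex extracts before take_first and exact_match are applied:

import re

output = "Let's reason step by step. The relevant fact is X. So the answer is (B)."
match = re.search(r"the answer is (.*?)\.", output)
print(match.group(1))  # (B)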

+ 21 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/bbh/utils.py

@@ -0,0 +1,21 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "answer": doc["input_correct_responses"][0],
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 29 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
+task: meta_gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 21 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/utils.py

@@ -0,0 +1,21 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+            "choices": list(doc["input_choice_list"]),
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "input_choice_list", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 32 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml

@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: parquet
+dataset_kwargs:
+  # WORK_DIR is replaced by meta_eval.py with the directory of this copied yaml;
+  # prepare_dataset.py writes joined_ifeval.parquet one level up, in work_dir itself.
+  data_files: WORK_DIR/../joined_ifeval.parquet
+output_type: generate_until
+test_split: train
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+  max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_strict_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_loose_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+metadata:
+  version: 2.0
+fewshot_config:
+  sampler: first_n

+ 139 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/utils.py

@@ -0,0 +1,139 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+                
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    new_kwargs = []
+    for item in doc["kwargs"]:
+        if item["nth_paragraph"]:
+            item["nth_paragraph"] = int(item["nth_paragraph"])
+        new_kwargs.append(item)
+    inp = InputExample(
+        key=doc["key"],
+        instruction_id_list=doc["instruction_id_list"],
+        prompt=doc["prompt"],
+        kwargs=new_kwargs,
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        "prompt_level_strict_acc": out_strict.follow_all_instructions,
+        "inst_level_strict_acc": out_strict.follow_instruction_list,
+        "prompt_level_loose_acc": out_loose.follow_all_instructions,
+        "inst_level_loose_acc": out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
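As a quick worked example of the instruction-level aggregation above (the boolean lists are illustrative): each sample contributes one boolean per instruction, and the lists are flattened before averaging:

items = [[True, False], [True], [True, True, False]]
flat = [x for sub in items for x in sub]
print(sum(flat) / len(flat))  # 0.666... (4 of 6 instructions followed)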

+ 46 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard.py

@@ -0,0 +1,46 @@
+from datasets import load_dataset
+import os
+import yaml
+
+
+def load_config(config_path: str = "./eval_config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        return yaml.safe_load(file)
+
+
+# Quick local check that the joined MATH-Hard parquet produced by prepare_dataset.py
+# can be loaded from the configured work_dir.
+current_dir = os.getcwd()
+print("current_dir", current_dir)
+config = load_config(os.path.join(current_dir, "eval_config.yaml"))
+work_dir = config["work_dir"]
+load_dataset("parquet", data_files=os.path.join(current_dir, work_dir, "joined_math.parquet"))

+ 26 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml

@@ -0,0 +1,26 @@
+dataset_path: parquet
+dataset_name: null
+dataset_kwargs:
+  data_files:
+    # WORK_DIR is replaced by meta_eval.py with the directory of this copied yaml;
+    # prepare_dataset.py writes joined_math.parquet one level up, in work_dir itself.
+    train: WORK_DIR/../joined_math.parquet
+  trust_remote_code: true
+task: meta_math_hard
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 5120
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 303 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py

@@ -0,0 +1,303 @@
+# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py
+import re
+import signal
+from typing import Dict, List, Optional
+
+import datasets
+
+from lm_eval.utils import eval_logger
+
+
+try:
+    import sympy
+    from sympy.parsing.latex import parse_latex
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "`sympy` is required for the MATH-Hard answer equivalence check. "
+        "Please install it via pip install lm-eval[math] or pip install -e .[math]",
+    )
+
+# taken from
+# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": normalize_final_answer(
+                 remove_boxed(last_boxed_only_string(doc["solution"]))
+            ),
+            "meta_target": doc["input_correct_responses"]
+        }
+        return out_doc
+    #dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    return dataset.map(_process_doc)
+
+
+
+def extract_result_from_boxed(answer: str) -> str:
+    box_start = "\\boxed"
+    # format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
+    start = answer.rfind(box_start)
+    if start < 0:
+        return ""
+    answer = answer[start + len(box_start) :].strip()
+    ends_with_curly = answer.startswith("{")
+    i = 0
+    open_braces = 0
+    while i < len(answer):
+        if answer[i] == "{":
+            open_braces += 1
+        elif answer[i] == "}":
+            open_braces -= 1
+        if open_braces == 0:
+            if ends_with_curly:
+                answer = answer[: i + 1].strip()
+                break
+            elif answer[i] == "$":
+                answer = answer[:i].strip()
+                break
+        i += 1
+    else:
+        return ""
+    # remove extra curly braces
+    while True:
+        if answer.startswith("{") and answer.endswith("}"):
+            answer = answer[1:-1].strip()
+        else:
+            break
+    return answer
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidates = results[0]
+
+    unnormalized_answer = get_unnormalized_answer(candidates)
+    if unnormalized_answer == "[invalidanswer]":
+        unnormalized_answer = extract_result_from_boxed(candidates)
+    answer = normalize_final_answer(unnormalized_answer)
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
+        retval = 1
+    else:
+        retval = 0
+
+    results = {
+        "exact_match": retval,
+    }
+    return results
+
+
+def last_boxed_only_string(string: str) -> Optional[str]:
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx : right_brace_idx + 1]
+
+    return retval
+
+
+def remove_boxed(s: str) -> str:
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[: len(left)] == left
+        return s[len(left) :]
+
+    left = "\\boxed{"
+
+    assert s[: len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left) : -1]
+
+
+class timeout:
+    def __init__(self, seconds=1, error_message="Timeout"):
+        self.seconds = seconds
+        self.error_message = error_message
+
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
+
+
+def is_equiv(x1: str, x2: str) -> bool:
+    """
+    x1 and x2 are normalized latex string
+    """
+    try:
+        with timeout(seconds=5):
+            try:
+                parsed_x1 = parse_latex(x1)
+                parsed_x2 = parse_latex(x2)
+            except (
+                sympy.parsing.latex.errors.LaTeXParsingError,
+                sympy.SympifyError,
+                TypeError,
+            ):
+                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
+                return False
+
+            try:
+                diff = parsed_x1 - parsed_x2
+            except TypeError:
+                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
+                return False
+
+            try:
+                if sympy.simplify(diff) == 0:
+                    return True
+                else:
+                    return False
+            except ValueError:
+                eval_logger.debug(
+                    f"Had some trouble simplifying when comparing {x1} and {x2}"
+                )
+    except TimeoutError:
+        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
+        return False
+    except ImportError as e:
+        eval_logger.error(e)
+        raise
+    except Exception as e:
+        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
+        return False
+
+
+def get_unnormalized_answer(text: str) -> str:
+    INVALID_ANSWER = "[invalidanswer]"
+    end_seq = "I hope it is correct."
+    text += end_seq
+    match = re.search(
+        r"Final Answer: The final answer is(.*?). I hope it is correct.",
+        text,
+    )
+    if match:
+        return match.group(1).strip()
+    else:
+        return INVALID_ANSWER
+
+
+SUBSTITUTIONS = [
+    ("an ", ""),
+    ("a ", ""),
+    (".$", "$"),
+    ("\\$", ""),
+    (r"\ ", ""),
+    (" ", ""),
+    ("mbox", "text"),
+    (",\\text{and}", ","),
+    ("\\text{and}", ","),
+    ("\\text{m}", "\\text{}"),
+]
+REMOVED_EXPRESSIONS = [
+    "square",
+    "ways",
+    "integers",
+    "dollars",
+    "mph",
+    "inches",
+    "ft",
+    "hours",
+    "km",
+    "units",
+    "\\ldots",
+    "sue",
+    "points",
+    "feet",
+    "minutes",
+    "digits",
+    "cents",
+    "degrees",
+    "cm",
+    "gm",
+    "pounds",
+    "meters",
+    "meals",
+    "edges",
+    "students",
+    "childrentickets",
+    "multiples",
+    "\\text{s}",
+    "\\text{.}",
+    "\\text{\ns}",
+    "\\text{}^2",
+    "\\text{}^3",
+    "\\text{\n}",
+    "\\text{}",
+    r"\mathrm{th}",
+    r"^\circ",
+    r"^{\circ}",
+    r"\;",
+    r",\!",
+    "{,}",
+    '"',
+    "\\dots",
+]
+
+
+def normalize_final_answer(final_answer: str) -> str:
+    """
+    Normalize a final answer to a quantitative reasoning question.
+
+    Copied character for character from appendix D of Lewkowycz et al. (2022)
+    """
+    final_answer = final_answer.split("=")[-1]
+
+    for before, after in SUBSTITUTIONS:
+        final_answer = final_answer.replace(before, after)
+    for expr in REMOVED_EXPRESSIONS:
+        final_answer = final_answer.replace(expr, "")
+
+    # Extract answer that is in LaTeX math, is bold,
+    # is surrounded by a box, etc.
+    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+
+    # Normalize shorthand TeX:
+    #  \fracab -> \frac{a}{b}
+    #  \frac{abc}{bef} -> \frac{abc}{bef}
+    #  \fracabc -> \frac{a}{b}c
+    #  \sqrta -> \sqrt{a}
+    #  \sqrtab -> sqrt{a}b
+    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+    final_answer = final_answer.replace("$", "")
+
+    # Normalize 100,000 -> 100000
+    if final_answer.replace(",", "").isdigit():
+        final_answer = final_answer.replace(",", "")
+
+    return final_answer
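A small worked example of the answer pipeline above, assuming this utils.py is importable as utils (the model output string is made up): get_unnormalized_answer pulls the text after "Final Answer: The final answer is", and normalize_final_answer strips the surrounding $ and \boxed{}:

from utils import get_unnormalized_answer, normalize_final_answer

raw = "Final Answer: The final answer is $\\boxed{\\frac{1}{2}}$. I hope it is correct."
ans = get_unnormalized_answer(raw)
print(ans)                          # $\boxed{\frac{1}{2}}$
print(normalize_final_answer(ans))  # \frac{1}{2}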

+ 6 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/meta_instruct.yaml

@@ -0,0 +1,6 @@
+group: meta_instruct
+task:
+  - meta_ifeval
+  - meta_math_hard
+  - meta_gpqa
+  - meta_mmlu_pro_instruct

+ 4 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/meta_pretrain.yaml

@@ -0,0 +1,4 @@
+group: meta_pretrain
+task:
+  - meta_bbh
+  - meta_mmlu_pro_pretrain

+ 28 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+output_type: generate_until
+task: meta_mmlu_pro_instruct
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 27 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -0,0 +1,27 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
+dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+output_type: generate_until
+task: meta_mmlu_pro_pretrain
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'answer is \(([A-Z])\)'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 22 - 0
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/utils.py

@@ -0,0 +1,22 @@
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        return {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+            "choices": list(doc["input_choice_list"]),
+        }
+
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "input_choice_list", "output_prediction_text"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)

+ 85 - 0
tools/benchmarks/meta_eval_reproduce/prepare_dataset.py

@@ -0,0 +1,85 @@
+from datasets import Dataset, load_dataset
+
+
+def get_ifeval_data(model_name, output_dir):
+    # Join the Meta IFeval evals dataset with the original instruction-following-eval
+    # dataset on the question text, so both the prompts and the grading kwargs are available.
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct", "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct and Meta-Llama-3.1-405B-Instruct are supported for IFEval")
+    original_dataset_name = "wis-k/instruction-following-eval"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__ifeval__strict__details",
+        split="latest",
+    )
+    ifeval_data = load_dataset(original_dataset_name, split="train")
+    meta_data = meta_data.map(get_question)
+    meta_df = meta_data.to_pandas()
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+    print("meta_df columns:", meta_df.columns)
+    print("ifeval_df columns:", ifeval_df.columns)
+
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct", "instruction_id_list", "kwargs", "output_prediction_text", "key"])
+    joined = joined.rename_column("output_prediction_text", "previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + "/joined_ifeval.parquet")
+
+
+def get_math_data(model_name, output_dir):
+    # Join the Meta MATH-Hard evals dataset with the original MATH-Hard dataset on the
+    # question text, so the reference solutions are available for answer checking.
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct", "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct and Meta-Llama-3.1-405B-Instruct are supported for MATH_hard")
+    original_dataset_name = "lighteval/MATH-Hard"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__math_hard__details",
+        split="latest",
+    )
+    math_data = load_dataset(original_dataset_name, split="test")
+    meta_df = meta_data.to_pandas()
+    math_df = math_data.to_pandas()
+    math_df = math_df.rename(columns={"problem": "input_question"})
+    print("meta_df columns:", meta_df.columns)
+    print("math_df columns:", math_df.columns)
+
+    joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
+    # The joined frame keeps the original "is_correct" and "solution" columns,
+    # which math_hard/utils.py expects when it processes the parquet file.
+    joined.to_parquet(output_dir + "/joined_math.parquet")
+
+
+def get_question(example):
+    # input_question in the evals dataset is a serialized dialog; extract the user turn and
+    # normalize a couple of capitalization differences so the join on question text succeeds.
+    try:
+        example["input_question"] = eval(
+            example["input_question"].replace("null", "None").replace("true", "True").replace("false", "False")
+        )["dialog"][0]["body"].replace("Is it True that the first song", "Is it true that the first song").replace("Is the following True", "Is the following true")
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+    except Exception:
+        print(example["input_question"])
+        raise
+
+
+def check_sample(example):
+    if "kwargs" in example and not example["kwargs"]:
+        print(example)
+        raise ValueError("This example was not joined correctly for IFEval")
+    if "solution" in example and not example["solution"]:
+        print(example)
+        raise ValueError("This example was not joined correctly for MATH_hard")