
now uses lm_eval cli instead

Kai Wu, 8 months ago
parent commit 091f71e80e

File diff suppressed because it is too large
+ 37 - 14
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md


+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml

@@ -1,4 +1,4 @@
-model_name: "meta-llama/Meta-Llama-3.1-Instruct-8B" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
@@ -19,7 +19,7 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 output_path: "eval_results" # the output folder to store all the eval results and samples.
 
 #limit: 12 # Limit number of examples per task, set 'null' to run all.
-limit: null # Limit number of examples per task.
+limit: null # Limit number of examples per task, set 'null' to run all.
 
 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
 

+ 0 - 194
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_eval.py

@@ -1,194 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-import argparse
-import json
-import logging
-import os
-import re
-import sys
-from pathlib import Path
-import glob
-import numpy as np
-import lm_eval
-from lm_eval import tasks
-from lm_eval.utils import make_table
-from prepare_dataset import get_ifeval_data, get_math_data
-import shutil, errno
-import yaml
-from datetime import datetime
-
-def _handle_non_serializable(o):
-    if isinstance(o, np.int64) or isinstance(o, np.int32):
-        return int(o)
-    elif isinstance(o, set):
-        return list(o)
-    else:
-        return str(o)
-
-
-def setup_logging(verbosity):
-    logging.basicConfig(
-        level=verbosity.upper(), format="%(asctime)s - %(levelname)s - %(message)s"
-    )
-    return logging.getLogger(__name__)
-
-def change_yaml(args, base_name):
-    for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):       
-        with open(yaml_file, "r") as sources:
-            lines = sources.readlines()
-        output_path = yaml_file.replace(args.template_dir,args.work_dir)
-        print(f"changing {yaml_file} to output_path: {output_path}")
-        path = Path(output_path)
-        yaml_dir = path.parent
-        with open(output_path, "w") as output:
-            for line in lines:
-                output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
-def handle_output(args, results, logger):
-    if not results:
-        logger.error("No results found.")
-        sys.exit(1)
-    if not args.output_path:
-        if args.log_samples:
-            logger.error("Specify --output_path for logging samples.")
-            sys.exit(1)
-        return
-
-    if args.log_samples:
-        samples = results.pop("samples")
-    results_str = json.dumps(
-        results, indent=2, default=_handle_non_serializable, ensure_ascii=False
-    )
-    if args.show_config:
-        logger.info(results_str)
-    date_id = datetime.now().isoformat().replace(":", "-")
-    path = Path(args.output_path)
-
-
-    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-
-    file_path = os.path.join(args.output_path, "eval_results_" + date_id + ".json")
-    with open(file_path , "w", encoding="utf-8") as f:
-        f.write(results_str)
-
-    if args.log_samples:
-        for task_name, _ in results.get("configs", {}).items():
-            output_name = task_name + "_"+ date_id + re.sub(r"/|=", "_", args.model_args.split(",")[0].replace("pretrained",""))
-            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
-            sample_data = json.dumps(
-                samples.get(task_name, {}), indent=2, default=_handle_non_serializable
-            )
-            sample_file.write_text(sample_data, encoding="utf-8")
-
-    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
-    summary = f"{args.model_name} ({args.model_args})"
-    logger.info(summary)
-    logger.info(make_table(results))
-    if "groups" in results:
-        logger.info(make_table(results, "groups"))
-
-
-def load_tasks(args):
-    if not args.tasks or "meta" not in args.tasks:
-        raise ValueError("Please specify a valid meta task name")
-    if args.tasks:
-        tasks_list = args.tasks.split(",") 
-    else:
-        print("No tasks specified. Please try again")
-        sys.exit(1)
-    current_dir = os.getcwd()
-    config_dir = os.path.join(current_dir, args.work_dir)
-    print(f"Including the config_dir to task manager: {config_dir}")
-    task_manager = tasks.TaskManager(include_path=config_dir)
-    return task_manager, tasks_list
-
-def copy_and_prepare(args):
-    if not os.path.exists(args.work_dir):
-        # Copy the all files, including yaml files and python files, from template folder to the work folder
-
-        copy_dir(args.template_dir,args.work_dir)
-    else:
-        print("work_dir already exists, no need to copy files")
-    # Use the template yaml to get the correct model name in work_dir yaml
-    base_name = args.evals_dataset.split("/")[-1].replace("-evals","").replace("-Instruct","")
-    change_yaml(args, base_name)
-
-def parse_eval_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "--config_path",
-        type=str,
-        default="./eval_config.yaml",
-        help="the config yaml file that contains all the eval parameters",
-    )
-    return parser.parse_args()
-
-def prepare_datasets(task_list,args):
-    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions.
-    # model_name are derived from the evals_dataset name
-    model_name = args.evals_dataset.split("/")[-1].replace("-evals","")
-    if "meta_instruct" in task_list:
-        get_ifeval_data(model_name,args.work_dir)
-        
-        get_math_data(model_name,args.work_dir)
-    else:
-        if "meta_ifeval" in task_list:
-            get_ifeval_data(model_name,args.work_dir)
-        if "meta_math_hard" in task_list:
-            get_math_data(model_name,args.work_dir)
-    
-def evaluate_model(args):
-    # Customized model such as Quantized model etc.
-    # In case you are working with a custom model, you can use the following guide to add it here:
-    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
-    task_manager, task_list = load_tasks(args)
-    logger.info(f"Loaded tasks: {task_list}")
-    # We need to prepare the dataset for the IFeval and MATH_Hard tasks
-    if "meta_instruct" in task_list or "meta_ifeval" in task_list or "meta_math_hard" in task_list:
-        prepare_datasets(task_list, args)
-    # Evaluate
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=args.model_args,
-        tasks=task_list,
-        limit=args.limit,
-        log_samples=args.log_samples,
-        task_manager=task_manager,
-        random_seed=42,
-        numpy_random_seed=42,
-        torch_random_seed=42,
-        fewshot_random_seed=42
-        )
-    handle_output(args, results, logger)
-
-
-def copy_dir(src, dst):
-    try:
-        shutil.copytree(src, dst)
-    except OSError as exc: # python >2.5
-        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
-            shutil.copy(src, dst)
-        else: raise
-def load_config(config_path: str = "./config.yaml"):
-    # Read the YAML configuration file
-    with open(config_path, "r") as file:
-        config = yaml.safe_load(file)
-    return config
-
-if __name__ == "__main__":
-    args = parse_eval_args()
-    config = load_config(args.config_path)
-    # Create VLLM model args
-    for k,v in config.items():
-        args.__setattr__(k,v)
-    if not os.path.exists(args.template_dir):
-        raise ValueError("The template_dir does not exist, please check the path")
-    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
-        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
-    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
-    # Copy the all files from template folder to the work folder
-    copy_and_prepare(args)
-    logger = setup_logging(args.verbosity)
-    evaluate_model(args)
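
With this commit the in-process driver above goes away: lm_eval.simple_evaluate(model="vllm", ...) is no longer called from Python; the equivalent flags are handed to the lm_eval CLI via the command string that prepare_meta_eval.py prints (see below). A rough correspondence sketch, with illustrative values only, and assuming the single --seed 42 in the printed command stands in for the four per-library seeds set here:

import os

# Illustrative model_args string; the real one is built from eval_config.yaml
# exactly as in prepare_meta_eval.py below.
model_args = "pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct,tensor_parallel_size=1,dtype=auto,add_bos_token=True,seed=42"
work_dir = os.path.abspath("work_dir")

# simple_evaluate kwarg                     ->  lm_eval CLI flag
# model="vllm"                              ->  --model vllm
# model_args=args.model_args                ->  --model_args <same string>
# tasks=task_list                           ->  --tasks meta_instruct
# limit=args.limit                          ->  --limit N (only when set)
# TaskManager(include_path=config_dir)      ->  --include_path <work_dir>
# log_samples=args.log_samples              ->  --log_samples
cli = ["lm_eval", "--model", "vllm", "--model_args", model_args,
       "--tasks", "meta_instruct", "--batch_size", "auto",
       "--output_path", "eval_results", "--include_path", work_dir,
       "--seed", "42"]
print(" ".join(cli))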

+ 0 - 71
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_dataset.py

@@ -1,71 +0,0 @@
-from datasets import load_dataset,Dataset
-
-def get_ifeval_data(model_name,output_dir):
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
-    original_dataset_name = "wis-k/instruction-following-eval"
-    meta_dataset_name = f"meta-llama/{model_name}-evals"
-    meta_data = load_dataset(
-        meta_dataset_name,
-        name=f"{model_name}-evals__ifeval__strict__details",
-        split="latest"
-        )
-    ifeval_data = load_dataset(
-        original_dataset_name,
-        split="train"
-        )
-    meta_data = meta_data.map(get_question)
-    meta_df = meta_data.to_pandas()
-    ifeval_df = ifeval_data.to_pandas()
-    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
-
-    joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
-    joined = joined.rename(columns={"input_final_prompts": "prompt"})
-    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
-    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
-    joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
-    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
-def get_math_data(model_name,output_dir):
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
-    original_dataset_name = "lighteval/MATH-Hard"
-    meta_dataset_name = f"meta-llama/{model_name}-evals"
-    meta_data = load_dataset(
-        meta_dataset_name,
-        name=f"{model_name}-evals__math_hard__details",
-        split="latest"
-        )
-    math_data = load_dataset(
-        original_dataset_name,
-        split="test"
-        )
-    meta_df = meta_data.to_pandas()
-    math_df = math_data.to_pandas()
-    math_df = math_df.rename(columns={"problem": "input_question"})
-
-    joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
-    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-    joined = joined.rename_column("is_correct","previous_is_correct")
-    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
-    joined.to_parquet(output_dir + f"/joined_math.parquet")
-def get_question(example):
-    try:
-        example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
-        example["input_final_prompts"] = example["input_final_prompts"][0]
-        return example
-    except:
-        print(example["input_question"])
-        return
-def check_sample(example):
-    if "kwargs" in example and not example["kwargs"]:
-        print(example)
-        raise ValueError("This example did not got joined for IFeval")
-    if "solution" in example and not example["solution"]:
-        print(example)
-        raise ValueError("This example did not got joined for MATH_hard")

+ 176 - 0
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py

@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+import glob
+import numpy as np
+import lm_eval
+from lm_eval import tasks
+from lm_eval.utils import make_table
+import shutil, errno
+import yaml
+from datasets import load_dataset,Dataset
+
+def get_ifeval_data(model_name,output_dir):
+    print(f"preparing the ifeval data using {model_name}'s evals dataset")
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
+    original_dataset_name = "wis-k/instruction-following-eval"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__ifeval__strict__details",
+        split="latest"
+        )
+    ifeval_data = load_dataset(
+        original_dataset_name,
+        split="train"
+        )
+    meta_data = meta_data.map(get_question)
+    meta_df = meta_data.to_pandas()
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+    joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
+    joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
+def get_math_data(model_name,output_dir):
+    print(f"preparing the math data using {model_name}'s evals dataset")
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
+    original_dataset_name = "lighteval/MATH-Hard"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__math_hard__details",
+        split="latest"
+        )
+    math_data = load_dataset(
+        original_dataset_name,
+        split="test"
+        )
+    meta_df = meta_data.to_pandas()
+    math_df = math_data.to_pandas()
+    math_df = math_df.rename(columns={"problem": "input_question"})
+
+    joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
+    joined = joined.rename_column("is_correct","previous_is_correct")
+    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + f"/joined_math.parquet")
+def get_question(example):
+    try:
+        example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+    except:
+        print(example["input_question"])
+        return
+def check_sample(example):
+    if "kwargs" in example and not example["kwargs"]:
+        print(example)
+        raise ValueError("This example did not got joined for IFeval")
+    if "solution" in example and not example["solution"]:
+        print(example)
+        raise ValueError("This example did not got joined for MATH_hard")
+
+
+def change_yaml(args, base_name):
+    for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):       
+        with open(yaml_file, "r") as sources:
+            lines = sources.readlines()
+        output_path = yaml_file.replace(args.template_dir,args.work_dir)
+        print(f"changing {yaml_file} to output_path: {output_path}")
+        path = Path(output_path)
+        yaml_dir = path.parent
+        with open(output_path, "w") as output:
+            for line in lines:
+                output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
+
+def copy_and_prepare(args):
+    if not os.path.exists(args.work_dir):
+        # Copy all the files, including yaml files and python files, from the template folder to the work folder
+
+        copy_dir(args.template_dir,args.work_dir)
+    else:
+        print("work_dir already exists, no need to copy files")
+    # Use the template yaml to get the correct model name in work_dir yaml
+    base_name = args.evals_dataset.split("/")[-1].replace("-evals","").replace("-Instruct","")
+    change_yaml(args, base_name)
+
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        default="./eval_config.yaml",
+        help="the config yaml file that contains all the eval parameters",
+    )
+    return parser.parse_args()
+
+def prepare_datasets(args):
+    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions.
+    # model_name is derived from the evals_dataset name
+    task_list = args.tasks.split(",")
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals","")
+    if "meta_instruct" in task_list:
+        get_ifeval_data(model_name,args.work_dir)
+        
+        get_math_data(model_name,args.work_dir)
+    else:
+        if "meta_ifeval" in task_list:
+            get_ifeval_data(model_name,args.work_dir)
+        if "meta_math_hard" in task_list:
+            get_math_data(model_name,args.work_dir)
+    
+def copy_dir(src, dst):
+    try:
+        shutil.copytree(src, dst)
+    except OSError as exc: # python >2.5
+        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
+            shutil.copy(src, dst)
+        else: raise
+def load_config(config_path: str = "./config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+if __name__ == "__main__":
+    args = parse_eval_args()
+    config = load_config(args.config_path)
+    # Create VLLM model args
+    for k,v in config.items():
+        args.__setattr__(k,v)
+    if not os.path.exists(args.template_dir):
+        raise ValueError("The template_dir does not exist, please check the path")
+    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
+        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
+    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
+    # Copy all the files from the template folder to the work folder
+    copy_and_prepare(args)
+    prepare_datasets(args)
+    print(f"prepration for the {args.model_name} using {args.evals_dataset} is done, all saved the work_dir: {args.work_dir}")
+    command_str = f"lm_eval --model vllm   --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path { args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42 "
+    if args.limit:
+        command_str += f" --limit {args.limit}"
+    if args.log_samples:
+        command_str += f" --log_samples "
+    if args.show_config:
+        command_str += f" --show_config "
+    print("please use the following command to run the meta reproduce evals:")
+    print(command_str)
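
End-to-end, the new flow is: run prepare_meta_eval.py once to populate the work_dir (copied templates plus the joined parquet files), then execute the lm_eval command it prints. A minimal usage sketch; re-running the printed command via subprocess is an assumption, copy-pasting it into a shell works just as well:

import subprocess

# Step 1: prepare templates, datasets and the command string.
prep = subprocess.run(
    ["python", "prepare_meta_eval.py", "--config_path", "./eval_config.yaml"],
    capture_output=True, text=True, check=True,
)
print(prep.stdout)

# Step 2: the last line starting with "lm_eval" is the evaluation command.
cmd = [line for line in prep.stdout.splitlines() if line.startswith("lm_eval")][-1]
subprocess.run(cmd.split(), check=True)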