@@ -2,105 +2,154 @@
# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.

import argparse
+import errno, shutil
+import glob
import os
from pathlib import Path
-import glob
-import shutil, errno
+
import yaml
-from datasets import load_dataset,Dataset
+from datasets import Dataset, load_dataset
+

# get the ifeval from the evals dataset and join it with the original ifeval datasets
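+# the evals dataset only ships prompts and model predictions, so we join back to the
+# original ifeval data to recover fields such as instruction_id_list and kwargs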
-def get_ifeval_data(model_name,output_dir):
+def get_ifeval_data(model_name, output_dir):
    print(f"preparing the ifeval data using {model_name}'s evals dataset")
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
+    if model_name not in [
+        "Meta-Llama-3.1-8B-Instruct",
+        "Meta-Llama-3.1-70B-Instruct",
+        "Meta-Llama-3.1-405B-Instruct",
+    ]:
+        raise ValueError(
+            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval"
+        )
    original_dataset_name = "wis-k/instruction-following-eval"
    meta_dataset_name = f"meta-llama/{model_name}-evals"
    meta_data = load_dataset(
        meta_dataset_name,
        name=f"{model_name}-evals__ifeval__strict__details",
-        split="latest"
-    )
-    ifeval_data = load_dataset(
-        original_dataset_name,
-        split="train"
-    )
+        split="latest",
+    )
+    ifeval_data = load_dataset(original_dataset_name, split="train")
    meta_data = meta_data.map(get_question)
    meta_df = meta_data.to_pandas()
    ifeval_df = ifeval_data.to_pandas()
    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
    # join the two datasets on the input_question column
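+    # pandas .join defaults to a left join on exact question text, so evals rows
+    # without a match in the original ifeval set will carry NaN columns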
-    joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
    joined = joined.rename(columns={"input_final_prompts": "prompt"})
    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
-    joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    joined = joined.select_columns(
+        [
+            "input_question",
+            "prompt",
+            "previous_is_correct",
+            "instruction_id_list",
+            "kwargs",
+            "output_prediction_text",
+            "key",
+        ]
+    )
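+    # Dataset.rename_column returns a new Dataset instead of mutating in place,
+    # so the result must be assigned back (as get_math_data below already does)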
+    joined = joined.rename_column(
+        "output_prediction_text", "previous_output_prediction_text"
+    )
    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")

+
# get the math_hard data from the evals dataset and join it with the original math_hard dataset
-def get_math_data(model_name,output_dir):
+def get_math_data(model_name, output_dir):
    print(f"preparing the math data using {model_name}'s evals dataset")
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
+    if model_name not in [
+        "Meta-Llama-3.1-8B-Instruct",
+        "Meta-Llama-3.1-70B-Instruct",
+        "Meta-Llama-3.1-405B-Instruct",
+    ]:
+        raise ValueError(
+            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard"
+        )
    original_dataset_name = "lighteval/MATH-Hard"
    meta_dataset_name = f"meta-llama/{model_name}-evals"
    meta_data = load_dataset(
        meta_dataset_name,
        name=f"{model_name}-evals__math_hard__details",
-        split="latest"
-    )
-    math_data = load_dataset(
-        original_dataset_name,
-        split="test"
-    )
+        split="latest",
+    )
+    math_data = load_dataset(original_dataset_name, split="test")
    meta_df = meta_data.to_pandas()
    math_df = math_data.to_pandas()
    math_df = math_df.rename(columns={"problem": "input_question"})
    # join the two datasets on the input_question column
-    joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
+    joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-    joined = joined.rename_column("is_correct","previous_is_correct")
-    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    joined = joined.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "solution",
+            "output_prediction_text",
+        ]
+    )
+    joined = joined.rename_column("is_correct", "previous_is_correct")
+    joined = joined.rename_column(
+        "output_prediction_text", "previous_output_prediction_text"
+    )

    joined.to_parquet(output_dir + f"/joined_math.parquet")

-# get the question from the ifeval dataset
+
+# get the question from the ifeval dataset
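+# note: input_question arrives as a JSON-encoded dialog string; the replaces below
+# map the JSON literals null/true/false to Python spellings so eval() can parse it
+# (json.loads would be the safer choice for this)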
def get_question(example):
    try:
-        example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
+        example["input_question"] = (
+            eval(
+                example["input_question"]
+                .replace("null", "None")
+                .replace("true", "True")
+                .replace("false", "False")
+            )["dialog"][0]["body"]
+            .replace("Is it True that the first song", "Is it true that the first song")
+            .replace("Is the following True", "Is the following true")
+        )
        example["input_final_prompts"] = example["input_final_prompts"][0]
        return example
    except:
        print(example["input_question"])
        return

+
# change the yaml file to use the correct model name
def change_yaml(args, base_name):
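+    # the glob pattern is built by plain string concatenation, so template_dir is
+    # expected to end with a path separator for the recursive match to work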
-    for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):
+    for yaml_file in glob.glob(args.template_dir + "**/*/*.yaml", recursive=True):
        with open(yaml_file, "r") as sources:
            lines = sources.readlines()
-        output_path = yaml_file.replace(args.template_dir,args.work_dir)
+        output_path = yaml_file.replace(args.template_dir, args.work_dir)
        print(f"changing {yaml_file} to output_path: {output_path}")
        path = Path(output_path)
        yaml_dir = path.parent
        with open(output_path, "w") as output:
            for line in lines:
-                output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
+                output.write(
+                    line.replace("Meta-Llama-3.1-8B", base_name).replace(
+                        "WORK_DIR", str(yaml_dir)
+                    )
+                )
+

# copy the files and change the yaml file to use the correct model name
def copy_and_prepare(args):
    if not os.path.exists(args.work_dir):
        # Copy the all files, including yaml files and python files, from template folder to the work folder

-        copy_dir(args.template_dir,args.work_dir)
+        copy_dir(args.template_dir, args.work_dir)
    else:
        print("work_dir already exists, no need to copy files")
    # Use the template yaml to get the correct model name in work_dir yaml
-    base_name = args.evals_dataset.split("/")[-1].replace("-evals","").replace("-Instruct","")
+    base_name = (
+        args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
+    )
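+    # e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" yields "Meta-Llama-3.1-8B",
+    # the base model name substituted into the copied yaml templates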
    change_yaml(args, base_name)

+
def parse_eval_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
@@ -111,50 +160,69 @@ def parse_eval_args():
    )
    return parser.parse_args()

+
def prepare_datasets(args):
    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions.
    # model_name are derived from the evals_dataset name
    task_list = args.tasks.split(",")
-    model_name = args.evals_dataset.split("/")[-1].replace("-evals","")
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
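+    # e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" -> "Meta-Llama-3.1-8B-Instruct"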
    if "meta_instruct" in task_list:
-        get_ifeval_data(model_name,args.work_dir)
-
-        get_math_data(model_name,args.work_dir)
+        get_ifeval_data(model_name, args.work_dir)
+
+        get_math_data(model_name, args.work_dir)
    else:
        if "meta_ifeval" in task_list:
-            get_ifeval_data(model_name,args.work_dir)
+            get_ifeval_data(model_name, args.work_dir)
        if "meta_math_hard" in task_list:
-            get_math_data(model_name,args.work_dir)
+            get_math_data(model_name, args.work_dir)
+
+
# copy the files from src to dst
def copy_dir(src, dst):
    try:
        shutil.copytree(src, dst)
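+        # copytree fails with OSError when src is a single file (ENOTDIR on POSIX,
+        # EINVAL on Windows); those cases fall back to a plain file copy below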
-    except OSError as exc: # python >2.5
+    except OSError as exc:  # python >2.5
        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
            shutil.copy(src, dst)
-        else: raise
+        else:
+            raise
+
+
# load the config yaml file
def load_config(config_path: str = "./config.yaml"):
    # Read the YAML configuration file
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)
    return config
+
+
if __name__ == "__main__":
    args = parse_eval_args()
    config = load_config(args.config_path)
    # Create VLLM model args
-    for k,v in config.items():
-        args.__setattr__(k,v)
+    for k, v in config.items():
+        args.__setattr__(k, v)
    if not os.path.exists(args.template_dir):
        raise ValueError("The template_dir does not exist, please check the path")
-    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
-        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
+    if args.evals_dataset not in [
+        "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-70B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-405B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-8B-evals",
+        "meta-llama/Meta-Llama-3.1-70B-evals",
+        "meta-llama/Meta-Llama-3.1-405B-evals",
+    ]:
+        raise ValueError(
+            "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection"
+        )
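+    # model_args is the comma-separated key=value string that lm-evaluation-harness
+    # passes through to the vLLM engine when run with --model vllm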
    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
    # Copy the all files from template folder to the work folder
    copy_and_prepare(args)
    # Prepare the datasets for the IFeval and MATH_Hard tasks as we need to join the original dataset
    prepare_datasets(args)
-    print(f"prepration for the {args.model_name} using {args.evals_dataset} is done, all saved the work_dir: {args.work_dir}")
+    print(
+        f"preparation for the {args.model_name} using {args.evals_dataset} is done, all saved in the work_dir: {args.work_dir}"
+    )
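+    # assemble the lm_eval CLI call; --include_path lets the harness pick up the
+    # prepared task yamls under work_dir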
    command_str = f"lm_eval --model vllm --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path { args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42 "
    if args.limit:
        command_str += f" --limit {args.limit}"