
now uses lm_eval cli instead

Kai Wu, 8 months ago
parent commit 091f71e80e

File diff suppressed because it is too large
+ 37 - 14
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md


+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml

@@ -1,4 +1,4 @@
-model_name: "meta-llama/Meta-Llama-3.1-Instruct-8B" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
@@ -19,7 +19,7 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 output_path: "eval_results" # the output folder to store all the eval results and samples.
 
 #limit: 12 # Limit number of examples per task, set 'null' to run all.
-limit: null # Limit number of examples per task.
+limit: null # Limit number of examples per task, set 'null' to run all.
 
 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
 

+ 0 - 194
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_eval.py

@@ -1,194 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-import argparse
-import json
-import logging
-import os
-import re
-import sys
-from pathlib import Path
-import glob
-import numpy as np
-import lm_eval
-from lm_eval import tasks
-from lm_eval.utils import make_table
-from prepare_dataset import get_ifeval_data, get_math_data
-import shutil, errno
-import yaml
-from datetime import datetime
-
-def _handle_non_serializable(o):
-    if isinstance(o, np.int64) or isinstance(o, np.int32):
-        return int(o)
-    elif isinstance(o, set):
-        return list(o)
-    else:
-        return str(o)
-
-
-def setup_logging(verbosity):
-    logging.basicConfig(
-        level=verbosity.upper(), format="%(asctime)s - %(levelname)s - %(message)s"
-    )
-    return logging.getLogger(__name__)
-
-def change_yaml(args, base_name):
-    for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):       
-        with open(yaml_file, "r") as sources:
-            lines = sources.readlines()
-        output_path = yaml_file.replace(args.template_dir,args.work_dir)
-        print(f"changing {yaml_file} to output_path: {output_path}")
-        path = Path(output_path)
-        yaml_dir = path.parent
-        with open(output_path, "w") as output:
-            for line in lines:
-                output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
-def handle_output(args, results, logger):
-    if not results:
-        logger.error("No results found.")
-        sys.exit(1)
-    if not args.output_path:
-        if args.log_samples:
-            logger.error("Specify --output_path for logging samples.")
-            sys.exit(1)
-        return
-
-    if args.log_samples:
-        samples = results.pop("samples")
-    results_str = json.dumps(
-        results, indent=2, default=_handle_non_serializable, ensure_ascii=False
-    )
-    if args.show_config:
-        logger.info(results_str)
-    date_id = datetime.now().isoformat().replace(":", "-")
-    path = Path(args.output_path)
-
-
-    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-
-    file_path = os.path.join(args.output_path, "eval_results_" + date_id + ".json")
-    with open(file_path , "w", encoding="utf-8") as f:
-        f.write(results_str)
-
-    if args.log_samples:
-        for task_name, _ in results.get("configs", {}).items():
-            output_name = task_name + "_"+ date_id + re.sub(r"/|=", "_", args.model_args.split(",")[0].replace("pretrained",""))
-            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
-            sample_data = json.dumps(
-                samples.get(task_name, {}), indent=2, default=_handle_non_serializable
-            )
-            sample_file.write_text(sample_data, encoding="utf-8")
-
-    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
-    summary = f"{args.model_name} ({args.model_args})"
-    logger.info(summary)
-    logger.info(make_table(results))
-    if "groups" in results:
-        logger.info(make_table(results, "groups"))
-
-
-def load_tasks(args):
-    if not args.tasks or "meta" not in args.tasks:
-        raise ValueError("Please specify a valid meta task name")
-    if args.tasks:
-        tasks_list = args.tasks.split(",") 
-    else:
-        print("No tasks specified. Please try again")
-        sys.exit(1)
-    current_dir = os.getcwd()
-    config_dir = os.path.join(current_dir, args.work_dir)
-    print(f"Including the config_dir to task manager: {config_dir}")
-    task_manager = tasks.TaskManager(include_path=config_dir)
-    return task_manager, tasks_list
-
-def copy_and_prepare(args):
-    if not os.path.exists(args.work_dir):
-        # Copy the all files, including yaml files and python files, from template folder to the work folder
-
-        copy_dir(args.template_dir,args.work_dir)
-    else:
-        print("work_dir already exists, no need to copy files")
-    # Use the template yaml to get the correct model name in work_dir yaml
-    base_name = args.evals_dataset.split("/")[-1].replace("-evals","").replace("-Instruct","")
-    change_yaml(args, base_name)
-
-def parse_eval_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "--config_path",
-        type=str,
-        default="./eval_config.yaml",
-        help="the config yaml file that contains all the eval parameters",
-    )
-    return parser.parse_args()
-
-def prepare_datasets(task_list,args):
-    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions.
-    # model_name are derived from the evals_dataset name
-    model_name = args.evals_dataset.split("/")[-1].replace("-evals","")
-    if "meta_instruct" in task_list:
-        get_ifeval_data(model_name,args.work_dir)
-        
-        get_math_data(model_name,args.work_dir)
-    else:
-        if "meta_ifeval" in task_list:
-            get_ifeval_data(model_name,args.work_dir)
-        if "meta_math_hard" in task_list:
-            get_math_data(model_name,args.work_dir)
-    
-def evaluate_model(args):
-    # Customized model such as Quantized model etc.
-    # In case you are working with a custom model, you can use the following guide to add it here:
-    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
-    task_manager, task_list = load_tasks(args)
-    logger.info(f"Loaded tasks: {task_list}")
-    # We need to prepare the dataset for the IFeval and MATH_Hard tasks
-    if "meta_instruct" in task_list or "meta_ifeval" in task_list or "meta_math_hard" in task_list:
-        prepare_datasets(task_list, args)
-    # Evaluate
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=args.model_args,
-        tasks=task_list,
-        limit=args.limit,
-        log_samples=args.log_samples,
-        task_manager=task_manager,
-        random_seed=42,
-        numpy_random_seed=42,
-        torch_random_seed=42,
-        fewshot_random_seed=42
-        )
-    handle_output(args, results, logger)
-
-
-def copy_dir(src, dst):
-    try:
-        shutil.copytree(src, dst)
-    except OSError as exc: # python >2.5
-        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
-            shutil.copy(src, dst)
-        else: raise
-def load_config(config_path: str = "./config.yaml"):
-    # Read the YAML configuration file
-    with open(config_path, "r") as file:
-        config = yaml.safe_load(file)
-    return config
-
-if __name__ == "__main__":
-    args = parse_eval_args()
-    config = load_config(args.config_path)
-    # Create VLLM model args
-    for k,v in config.items():
-        args.__setattr__(k,v)
-    if not os.path.exists(args.template_dir):
-        raise ValueError("The template_dir does not exist, please check the path")
-    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
-        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
-    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
-    # Copy the all files from template folder to the work folder
-    copy_and_prepare(args)
-    logger = setup_logging(args.verbosity)
-    evaluate_model(args)
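
With this commit the in-process driver above goes away: lm_eval.simple_evaluate(model="vllm", ...) is no longer called from Python; the equivalent flags are handed to the lm_eval CLI via the command string that prepare_meta_eval.py prints (see below). A rough correspondence sketch, with illustrative values only, and assuming the single --seed 42 in the printed command stands in for the four per-library seeds set here:

import os

# Illustrative model_args string; the real one is built from eval_config.yaml
# exactly as in prepare_meta_eval.py below.
model_args = "pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct,tensor_parallel_size=1,dtype=auto,add_bos_token=True,seed=42"
work_dir = os.path.abspath("work_dir")

# simple_evaluate kwarg                     ->  lm_eval CLI flag
# model="vllm"                              ->  --model vllm
# model_args=args.model_args                ->  --model_args <same string>
# tasks=task_list                           ->  --tasks meta_instruct
# limit=args.limit                          ->  --limit N (only when set)
# TaskManager(include_path=config_dir)      ->  --include_path <work_dir>
# log_samples=args.log_samples              ->  --log_samples
cli = ["lm_eval", "--model", "vllm", "--model_args", model_args,
       "--tasks", "meta_instruct", "--batch_size", "auto",
       "--output_path", "eval_results", "--include_path", work_dir,
       "--seed", "42"]
print(" ".join(cli))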

+ 0 - 71
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_dataset.py

@@ -1,71 +0,0 @@
-from datasets import load_dataset,Dataset
-
-def get_ifeval_data(model_name,output_dir):
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
-    original_dataset_name = "wis-k/instruction-following-eval"
-    meta_dataset_name = f"meta-llama/{model_name}-evals"
-    meta_data = load_dataset(
-        meta_dataset_name,
-        name=f"{model_name}-evals__ifeval__strict__details",
-        split="latest"
-        )
-    ifeval_data = load_dataset(
-        original_dataset_name,
-        split="train"
-        )
-    meta_data = meta_data.map(get_question)
-    meta_df = meta_data.to_pandas()
-    ifeval_df = ifeval_data.to_pandas()
-    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
-
-    joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
-    joined = joined.rename(columns={"input_final_prompts": "prompt"})
-    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
-    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
-    joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
-    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
-def get_math_data(model_name,output_dir):
-    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
-        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
-    original_dataset_name = "lighteval/MATH-Hard"
-    meta_dataset_name = f"meta-llama/{model_name}-evals"
-    meta_data = load_dataset(
-        meta_dataset_name,
-        name=f"{model_name}-evals__math_hard__details",
-        split="latest"
-        )
-    math_data = load_dataset(
-        original_dataset_name,
-        split="test"
-        )
-    meta_df = meta_data.to_pandas()
-    math_df = math_data.to_pandas()
-    math_df = math_df.rename(columns={"problem": "input_question"})
-
-    joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
-    joined = Dataset.from_pandas(joined)
-    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-    joined = joined.rename_column("is_correct","previous_is_correct")
-    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    for item in joined:
-        check_sample(item)
-    joined.to_parquet(output_dir + f"/joined_math.parquet")
-def get_question(example):
-    try:
-        example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
-        example["input_final_prompts"] = example["input_final_prompts"][0]
-        return example
-    except:
-        print(example["input_question"])
-        return
-def check_sample(example):
-    if "kwargs" in example and not example["kwargs"]:
-        print(example)
-        raise ValueError("This example did not got joined for IFeval")
-    if "solution" in example and not example["solution"]:
-        print(example)
-        raise ValueError("This example did not got joined for MATH_hard")

+ 176 - 0
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py

@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+import glob
+import numpy as np
+import lm_eval
+from lm_eval import tasks
+from lm_eval.utils import make_table
+import shutil, errno
+import yaml
+from datasets import load_dataset,Dataset
+
+def get_ifeval_data(model_name,output_dir):
+    print(f"preparing the ifeval data using {model_name}'s evals dataset")
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
+    original_dataset_name = "wis-k/instruction-following-eval"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__ifeval__strict__details",
+        split="latest"
+        )
+    ifeval_data = load_dataset(
+        original_dataset_name,
+        split="train"
+        )
+    meta_data = meta_data.map(get_question)
+    meta_df = meta_data.to_pandas()
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+    joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
+    joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
+def get_math_data(model_name,output_dir):
+    print(f"preparing the math data using {model_name}'s evals dataset")
+    if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
+        raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
+    original_dataset_name = "lighteval/MATH-Hard"
+    meta_dataset_name = f"meta-llama/{model_name}-evals"
+    meta_data = load_dataset(
+        meta_dataset_name,
+        name=f"{model_name}-evals__math_hard__details",
+        split="latest"
+        )
+    math_data = load_dataset(
+        original_dataset_name,
+        split="test"
+        )
+    meta_df = meta_data.to_pandas()
+    math_df = math_data.to_pandas()
+    math_df = math_df.rename(columns={"problem": "input_question"})
+
+    joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
+    joined = joined.rename_column("is_correct","previous_is_correct")
+    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
+    joined.to_parquet(output_dir + f"/joined_math.parquet")
+def get_question(example):
+    try:
+        example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+    except:
+        print(example["input_question"])
+        return
+def check_sample(example):
+    if "kwargs" in example and not example["kwargs"]:
+        print(example)
+        raise ValueError("This example did not got joined for IFeval")
+    if "solution" in example and not example["solution"]:
+        print(example)
+        raise ValueError("This example did not got joined for MATH_hard")
+
+
+def change_yaml(args, base_name):
+    for yaml_file in glob.glob(args.template_dir+'**/*/*.yaml', recursive=True):       
+        with open(yaml_file, "r") as sources:
+            lines = sources.readlines()
+        output_path = yaml_file.replace(args.template_dir,args.work_dir)
+        print(f"changing {yaml_file} to output_path: {output_path}")
+        path = Path(output_path)
+        yaml_dir = path.parent
+        with open(output_path, "w") as output:
+            for line in lines:
+                output.write(line.replace("Meta-Llama-3.1-8B",base_name).replace("WORK_DIR",str(yaml_dir)))
+
+def copy_and_prepare(args):
+    if not os.path.exists(args.work_dir):
+        # Copy all the files, including yaml files and python files, from the template folder to the work folder
+
+        copy_dir(args.template_dir,args.work_dir)
+    else:
+        print("work_dir already exists, no need to copy files")
+    # Use the template yaml to get the correct model name in work_dir yaml
+    base_name = args.evals_dataset.split("/")[-1].replace("-evals","").replace("-Instruct","")
+    change_yaml(args, base_name)
+
+def parse_eval_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        default="./eval_config.yaml",
+        help="the config yaml file that contains all the eval parameters",
+    )
+    return parser.parse_args()
+
+def prepare_datasets(args):
+    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions.
+    # model_name is derived from the evals_dataset name
+    task_list = args.tasks.split(",")
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals","")
+    if "meta_instruct" in task_list:
+        get_ifeval_data(model_name,args.work_dir)
+        
+        get_math_data(model_name,args.work_dir)
+    else:
+        if "meta_ifeval" in task_list:
+            get_ifeval_data(model_name,args.work_dir)
+        if "meta_math_hard" in task_list:
+            get_math_data(model_name,args.work_dir)
+    
+def copy_dir(src, dst):
+    try:
+        shutil.copytree(src, dst)
+    except OSError as exc: # python >2.5
+        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
+            shutil.copy(src, dst)
+        else: raise
+def load_config(config_path: str = "./config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+if __name__ == "__main__":
+    args = parse_eval_args()
+    config = load_config(args.config_path)
+    # Create VLLM model args
+    for k,v in config.items():
+        args.__setattr__(k,v)
+    if not os.path.exists(args.template_dir):
+        raise ValueError("The template_dir does not exist, please check the path")
+    if args.evals_dataset not in ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]:
+        raise ValueError("The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection")
+    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
+    # Copy all the files from the template folder to the work folder
+    copy_and_prepare(args)
+    prepare_datasets(args)
+    print(f"prepration for the {args.model_name} using {args.evals_dataset} is done, all saved the work_dir: {args.work_dir}")
+    command_str = f"lm_eval --model vllm   --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path { args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42 "
+    if args.limit:
+        command_str += f" --limit {args.limit}"
+    if args.log_samples:
+        command_str += f" --log_samples "
+    if args.show_config:
+        command_str += f" --show_config "
+    print("please use the following command to run the meta reproduce evals:")
+    print(command_str)
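
End-to-end, the new flow is: run prepare_meta_eval.py once to populate the work_dir (copied templates plus the joined parquet files), then execute the lm_eval command it prints. A minimal usage sketch; re-running the printed command via subprocess is an assumption, copy-pasting it into a shell works just as well:

import subprocess

# Step 1: prepare templates, datasets and the command string.
prep = subprocess.run(
    ["python", "prepare_meta_eval.py", "--config_path", "./eval_config.yaml"],
    capture_output=True, text=True, check=True,
)
print(prep.stdout)

# Step 2: the last line starting with "lm_eval" is the evaluation command.
cmd = [line for line in prep.stdout.splitlines() if line.startswith("lm_eval")][-1]
subprocess.run(cmd.split(), check=True)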