# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
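"""Prepare everything needed to reproduce the Meta Llama 3.x evals with lm_eval.

The script copies the eval template folder into a work directory, rewrites the
YAML task configs for the chosen model, joins the Meta evals datasets with the
original IFEval / MATH datasets where required, and prints the lm_eval command
to run.
"""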

import argparse
import ast
import errno
import glob
import os
import shutil
from pathlib import Path

import nltk
import yaml
from datasets import Dataset, load_dataset

LLAMA_3_1_INSTRUCT_EVALS = [
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    "meta-llama/Llama-3.1-70B-Instruct-evals",
    "meta-llama/Llama-3.1-405B-Instruct-evals",
]
LLAMA_3_1_PRETRAIN_EVALS = [
    "meta-llama/Llama-3.1-8B-evals",
    "meta-llama/Llama-3.1-70B-evals",
    "meta-llama/Llama-3.1-405B-evals",
]
LLAMA_3_2_INSTRUCT_EVALS = [
    "meta-llama/Llama-3.2-1B-Instruct-evals",
    "meta-llama/Llama-3.2-3B-Instruct-evals",
]
LLAMA_3_2_PRETRAIN_EVALS = [
    "meta-llama/Llama-3.2-1B-evals",
    "meta-llama/Llama-3.2-3B-evals",
]


# get the ifeval rows from the evals dataset and join them with the original ifeval dataset
def get_ifeval_data(model_name, output_dir):
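    """Build the joined IFEval parquet for the given model.

    Loads the model's ifeval evals details and the original
    wis-k/instruction-following-eval dataset, joins them on the question
    text, and writes the result to <output_dir>/joined_ifeval.parquet.
    """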
- print(f"preparing the ifeval data using {model_name}'s evals dataset")
- if model_name not in [
- "Llama-3.1-8B-Instruct",
- "Llama-3.1-70B-Instruct",
- "Llama-3.1-405B-Instruct",
- "Llama-3.3-70B-Instruct",
- ]:
- raise ValueError(
- "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
- )
- original_dataset_name = "wis-k/instruction-following-eval"
- meta_dataset_name = f"meta-llama/{model_name}-evals"
- meta_data = load_dataset(
- meta_dataset_name,
- name=f"{model_name}-evals__ifeval__strict__details",
- split="latest",
- )
- ifeval_data = load_dataset(original_dataset_name, split="train")
- meta_data = meta_data.map(get_question)
- meta_df = meta_data.to_pandas()
- ifeval_df = ifeval_data.to_pandas()
- ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
- # join the two datasets on the input_question column
- joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
- joined = joined.rename(columns={"input_final_prompts": "prompt"})
- joined = joined.rename(columns={"is_correct": "previous_is_correct"})
- joined = Dataset.from_pandas(joined)
- joined = joined.select_columns(
- [
- "input_question",
- "prompt",
- "previous_is_correct",
- "instruction_id_list",
- "kwargs",
- "output_prediction_text",
- "key",
- ]
- )
- joined.rename_column("output_prediction_text", "previous_output_prediction_text")
- joined.to_parquet(output_dir + "/joined_ifeval.parquet")


# get the math_hard data from the evals dataset and join it with the original math_hard dataset
def get_math_hard_data(model_name, output_dir):
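    """Build the joined MATH-hard parquet for the given 3.1/3.3 model.

    Loads the model's math_hard evals details and the original
    DigitalLearningGmbH/MATH-lighteval test split, joins them on the question
    text, and writes the result to <output_dir>/joined_math_hard.parquet.
    """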
- print(f"preparing the math hard data using {model_name}'s evals dataset")
- if model_name not in [
- "Llama-3.1-8B-Instruct",
- "Llama-3.1-70B-Instruct",
- "Llama-3.1-405B-Instruct",
- "Llama-3.3-70B-Instruct",
- ]:
- raise ValueError(
- "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
- )
- original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
- meta_dataset_name = f"meta-llama/{model_name}-evals"
- meta_data = load_dataset(
- meta_dataset_name,
- name=f"{model_name}-evals__math_hard__details",
- split="latest",
- )
- math_data = load_dataset(original_dataset_name, split="test")
- joined = join_meta_and_original_math_data(meta_data, math_data)
- joined.to_parquet(output_dir + "/joined_math_hard.parquet")


def get_math_data(model_name, output_dir):
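    """Build the joined MATH parquet for the given 3.2 model.

    Same join as get_math_hard_data, but against the math (not math_hard)
    evals details; the result is written to <output_dir>/joined_math.parquet.
    """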
- print(f"preparing the math data using {model_name}'s evals dataset")
- if model_name not in [
- "Llama-3.2-1B-Instruct",
- "Llama-3.2-3B-Instruct",
- ]:
- raise ValueError(
- "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
- )
- original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
- meta_dataset_name = f"meta-llama/{model_name}-evals"
- meta_data = load_dataset(
- meta_dataset_name,
- name=f"{model_name}-evals__math__details",
- split="latest",
- )
- math_data = load_dataset(original_dataset_name, split="test")
- joined = join_meta_and_original_math_data(meta_data, math_data)
- joined.to_parquet(output_dir + "/joined_math.parquet")


def join_meta_and_original_math_data(meta_data, math_data):
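    """Join the Meta evals details with the original MATH dataset.

    The original "problem" column is matched against the evals
    "input_question" column; prior results are kept under the
    previous_is_correct / previous_output_prediction_text columns.
    """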
    meta_df = meta_data.to_pandas()
    math_df = math_data.to_pandas()
    math_df = math_df.rename(columns={"problem": "input_question"})
    # join the two datasets on the input_question column
    joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
    joined = Dataset.from_pandas(joined)
    joined = joined.select_columns(
        [
            "input_question",
            "input_correct_responses",
            "input_final_prompts",
            "is_correct",
            "solution",
            "output_prediction_text",
        ]
    )
    joined = joined.rename_column("is_correct", "previous_is_correct")
    joined = joined.rename_column(
        "output_prediction_text", "previous_output_prediction_text"
    )
    return joined


# get the question from the ifeval dataset
def get_question(example):
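    """Extract the plain question text from an evals row.

    input_question holds a JSON-serialized dialog; this parses it, keeps the
    first message body, and normalizes two known casing differences so the
    join with the original IFEval dataset matches.
    """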
    try:
        # map the JSON literals to Python ones and parse the dialog safely
        # with ast.literal_eval instead of eval
        example["input_question"] = (
            ast.literal_eval(
                example["input_question"]
                .replace("null", "None")
                .replace("true", "True")
                .replace("false", "False")
            )["dialog"][0]["body"]
            .replace("Is it True that the first song", "Is it true that the first song")
            .replace("Is the following True", "Is the following true")
        )
        example["input_final_prompts"] = example["input_final_prompts"][0]
        return example
    except Exception:
        # returning None from a datasets.map function would fail, so log the
        # offending row and return it unchanged
        print(example["input_question"])
        return example


# change the yaml file to use the correct model name
def change_yaml(args, base_name):
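    """Instantiate the YAML templates for the chosen model in work_dir.

    Every template YAML is copied with "Llama-3.1-8B" replaced by the model
    base name and "WORK_DIR" replaced by the concrete output directory, and
    the task lists in meta_pretrain.yaml / meta_instruct.yaml are set to
    match the 3.1 or 3.2 evals dataset.
    """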
    for yaml_file in glob.glob(args.template_dir + "**/*/*.yaml", recursive=True):
        with open(yaml_file, "r") as sources:
            lines = sources.readlines()
        output_path = yaml_file.replace(args.template_dir, args.work_dir)
        print(f"changing {yaml_file} to output_path: {output_path}")
        path = Path(output_path)
        yaml_dir = path.parent
        with open(output_path, "w") as output:
            for line in lines:
                output.write(
                    line.replace("Llama-3.1-8B", base_name).replace(
                        "WORK_DIR", str(yaml_dir)
                    )
                )
    # the 3.2 evals dataset has a different set of tasks from 3.1
    # Update tasks in meta_pretrain.yaml
    with open(args.template_dir + "/meta_pretrain.yaml", "r") as yaml_file:
        meta_pretrain = yaml.safe_load(yaml_file)
    if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
        meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
    elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
        meta_pretrain["task"] = ["meta_mmlu_pretrain"]
    with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
        yaml.dump(meta_pretrain, yaml_file)

    # Update tasks in meta_instruct.yaml
    with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
        meta_instruct = yaml.safe_load(yaml_file)
    if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
        meta_instruct["task"] = [
            "meta_ifeval",
            "meta_math_hard",
            "meta_gpqa_cot",
            "meta_mmlu_pro_instruct",
        ]
    elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
        meta_instruct["task"] = ["meta_mmlu_instruct", "meta_math", "meta_gpqa"]
    with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
        yaml.dump(meta_instruct, yaml_file)


# copy the files and change the yaml file to use the correct model name
def copy_and_prepare(args):
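    """Copy the template folder into the work folder and rewrite its YAMLs
    for the model derived from the evals dataset name."""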
    # the nltk punkt_tab tokenizer data is needed
    nltk.download("punkt_tab")
    copy_dir(args.template_dir, args.work_dir)
    # use the template yaml to get the correct model name into the work_dir yaml
    base_name = (
        args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
    )
    change_yaml(args, base_name)


def parse_eval_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--config_path",
        type=str,
        default="./eval_config.yaml",
        help="the config yaml file that contains all the eval parameters",
    )
    return parser.parse_args()


def prepare_datasets(args):
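    """Produce the joined parquet files needed by the IFEval and MATH(-hard)
    tasks, choosing which joins to run from the task list and the evals
    dataset (3.1 vs 3.2)."""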
    # prepare the datasets for the IFeval and MATH_Hard tasks: the original
    # datasets must be joined with the evals dataset on the actual questions
    # the model name is derived from the evals_dataset name
    task_list = args.tasks.split(",")
    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
    if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
        get_ifeval_data(model_name, args.work_dir)
        get_math_hard_data(model_name, args.work_dir)
    elif "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
        get_math_data(model_name, args.work_dir)
    else:
        if "meta_ifeval" in task_list:
            get_ifeval_data(model_name, args.work_dir)
        if "meta_math_hard" in task_list:
            get_math_hard_data(model_name, args.work_dir)


# copy the files from src to dst
def copy_dir(src, dst):
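    """Recursively copy src to dst, falling back to a single-file copy when
    src is not a directory."""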
    try:
        shutil.copytree(src, dst, dirs_exist_ok=True)
    except OSError as exc:
        # fall back to a plain file copy when src is not a directory
        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
            shutil.copy(src, dst)
        else:
            raise


# load the config yaml file
def load_config(config_path: str = "./config.yaml"):
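    """Load the eval parameters from a YAML config file.

    An illustrative config sketch (keys inferred from how the script uses
    args below; values are placeholders, adjust them to your setup):

        model_name: "meta-llama/Llama-3.1-8B-Instruct"
        evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals"
        tasks: "meta_instruct"
        template_dir: "./meta_template"
        work_dir: "./work_dir"
        output_path: "./eval_results"
        tensor_parallel_size: 1
        data_parallel_size: 1
        gpu_memory_utilization: 0.9
        max_model_len: 8192
        limit: null
        log_samples: true
        show_config: false
    """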
    # Read the YAML configuration file
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)
    return config


if __name__ == "__main__":
    args = parse_eval_args()
    config = load_config(args.config_path)
    # merge the config entries into the args namespace
    for k, v in config.items():
        setattr(args, k, v)
    if not os.path.exists(args.template_dir):
        raise ValueError("The template_dir does not exist, please check the path")
    if args.evals_dataset not in (
        LLAMA_3_1_INSTRUCT_EVALS
        + LLAMA_3_1_PRETRAIN_EVALS
        + LLAMA_3_2_INSTRUCT_EVALS
        + LLAMA_3_2_PRETRAIN_EVALS
    ):
        raise ValueError(
            "The evals dataset is not valid; please double-check the name, which must be a name from the Llama 3.1 or 3.2 Evals collection."
        )
    # build the vLLM model_args string passed to lm_eval
    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
    # copy all files from the template folder to the work folder
    copy_and_prepare(args)
    # prepare the datasets for the IFeval and MATH_Hard tasks, which need the original datasets joined in
    prepare_datasets(args)
    print(
        f"preparation for {args.model_name} using {args.evals_dataset} is done, everything is saved in the work_dir: {args.work_dir}"
    )
    command_str = f"lm_eval --model vllm --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path {args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42"
    if args.limit:
        command_str += f" --limit {args.limit}"
    if args.log_samples:
        command_str += " --log_samples"
    if args.show_config:
        command_str += " --show_config"
    print("please use the following command to reproduce the Meta evals:")
    print(command_str)
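
# Example invocation (the script filename here is an assumption; use the
# actual file name in your checkout):
#   python prepare_meta_eval.py --config_path ./eval_config.yaml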
|