# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.

import argparse
import ast
import errno
import glob
import os
import shutil
from pathlib import Path

import nltk
import yaml
from datasets import Dataset, load_dataset

LLAMA_3_1_INSTRUCT_EVALS = [
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    "meta-llama/Llama-3.1-70B-Instruct-evals",
    "meta-llama/Llama-3.1-405B-Instruct-evals",
]
LLAMA_3_1_PRETRAIN_EVALS = [
    "meta-llama/Llama-3.1-8B-evals",
    "meta-llama/Llama-3.1-70B-evals",
    "meta-llama/Llama-3.1-405B-evals",
]
LLAMA_3_2_INSTRUCT_EVALS = [
    "meta-llama/Llama-3.2-1B-Instruct-evals",
    "meta-llama/Llama-3.2-3B-Instruct-evals",
]
LLAMA_3_2_PRETRAIN_EVALS = [
    "meta-llama/Llama-3.2-1B-evals",
    "meta-llama/Llama-3.2-3B-evals",
]

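# An illustrative eval_config.yaml for this script. The field names are the
# ones read below via load_config(); the values are placeholder assumptions,
# not defaults shipped with the repo:
#
#   model_name: "meta-llama/Llama-3.1-8B-Instruct"            # HF model to evaluate
#   evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals"   # one of the lists above
#   tasks: "meta_instruct"
#   template_dir: "./meta_template"   # hypothetical path
#   work_dir: "./work_dir"            # hypothetical path
#   output_path: "./eval_results"
#   tensor_parallel_size: 1
#   data_parallel_size: 1
#   gpu_memory_utilization: 0.9
#   max_model_len: 8192
#   limit: null          # optional cap on examples per task
#   log_samples: true
#   show_config: false
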

# get the ifeval data from the evals dataset and join it with the original ifeval dataset
def get_ifeval_data(model_name, output_dir):
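    """Build the joined IFEval parquet used by the meta_ifeval task.

    Loads the strict IFEval details from meta-llama/{model_name}-evals, joins
    them with wis-k/instruction-following-eval on the question text, and
    writes {output_dir}/joined_ifeval.parquet.
    """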
  32. print(f"preparing the ifeval data using {model_name}'s evals dataset")
  33. if model_name not in [
  34. "Llama-3.1-8B-Instruct",
  35. "Llama-3.1-70B-Instruct",
  36. "Llama-3.1-405B-Instruct",
  37. "Llama-3.3-70B-Instruct",
  38. ]:
  39. raise ValueError(
  40. "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for IFEval"
  41. )
  42. original_dataset_name = "wis-k/instruction-following-eval"
  43. meta_dataset_name = f"meta-llama/{model_name}-evals"
  44. meta_data = load_dataset(
  45. meta_dataset_name,
  46. name=f"{model_name}-evals__ifeval__strict__details",
  47. split="latest",
  48. )
  49. ifeval_data = load_dataset(original_dataset_name, split="train")
  50. meta_data = meta_data.map(get_question)
  51. meta_df = meta_data.to_pandas()
  52. ifeval_df = ifeval_data.to_pandas()
  53. ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
  54. # join the two datasets on the input_question column
  55. joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
  56. joined = joined.rename(columns={"input_final_prompts": "prompt"})
  57. joined = joined.rename(columns={"is_correct": "previous_is_correct"})
  58. joined = Dataset.from_pandas(joined)
  59. joined = joined.select_columns(
  60. [
  61. "input_question",
  62. "prompt",
  63. "previous_is_correct",
  64. "instruction_id_list",
  65. "kwargs",
  66. "output_prediction_text",
  67. "key",
  68. ]
  69. )
  70. joined.rename_column("output_prediction_text", "previous_output_prediction_text")
  71. joined.to_parquet(output_dir + "/joined_ifeval.parquet")

# get the math_hard data from the evals dataset and join it with the original MATH dataset
def get_math_hard_data(model_name, output_dir):
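    """Build the joined MATH-hard parquet used by the meta_math_hard task.

    Joins the math_hard details from meta-llama/{model_name}-evals with the
    test split of DigitalLearningGmbH/MATH-lighteval and writes
    {output_dir}/joined_math_hard.parquet.
    """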
  74. print(f"preparing the math hard data using {model_name}'s evals dataset")
  75. if model_name not in [
  76. "Llama-3.1-8B-Instruct",
  77. "Llama-3.1-70B-Instruct",
  78. "Llama-3.1-405B-Instruct",
  79. "Llama-3.3-70B-Instruct",
  80. ]:
  81. raise ValueError(
  82. "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct, Llama-3.3-70B-Instruct models are supported for MATH_hard"
  83. )
  84. original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
  85. meta_dataset_name = f"meta-llama/{model_name}-evals"
  86. meta_data = load_dataset(
  87. meta_dataset_name,
  88. name=f"{model_name}-evals__math_hard__details",
  89. split="latest",
  90. )
  91. math_data = load_dataset(original_dataset_name, split="test")
  92. joined = join_meta_and_original_math_data(meta_data, math_data)
  93. joined.to_parquet(output_dir + "/joined_math_hard.parquet")

def get_math_data(model_name, output_dir):
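    """Build the joined MATH parquet used by the meta_math task (3.2 models).

    Joins the math details from meta-llama/{model_name}-evals with the test
    split of DigitalLearningGmbH/MATH-lighteval and writes
    {output_dir}/joined_math.parquet.
    """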
  95. print(f"preparing the math data using {model_name}'s evals dataset")
  96. if model_name not in [
  97. "Llama-3.2-1B-Instruct",
  98. "Llama-3.2-3B-Instruct",
  99. ]:
  100. raise ValueError(
  101. "Only Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct models are supported for MATH"
  102. )
  103. original_dataset_name = "DigitalLearningGmbH/MATH-lighteval"
  104. meta_dataset_name = f"meta-llama/{model_name}-evals"
  105. meta_data = load_dataset(
  106. meta_dataset_name,
  107. name=f"{model_name}-evals__math__details",
  108. split="latest",
  109. )
  110. math_data = load_dataset(original_dataset_name, split="test")
  111. joined = join_meta_and_original_math_data(meta_data, math_data)
  112. joined.to_parquet(output_dir + "/joined_math.parquet")

def join_meta_and_original_math_data(meta_data, math_data):
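    """Join evals-dataset math records with the original MATH problems.

    Matches on the problem text and keeps the prompts, reference solutions,
    and the previous run's predictions/correctness for comparison.
    """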
    meta_df = meta_data.to_pandas()
    math_df = math_data.to_pandas()
    math_df = math_df.rename(columns={"problem": "input_question"})
    # join the two datasets on the input_question column
    joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
    joined = Dataset.from_pandas(joined)
    joined = joined.select_columns(
        [
            "input_question",
            "input_correct_responses",
            "input_final_prompts",
            "is_correct",
            "solution",
            "output_prediction_text",
        ]
    )
    joined = joined.rename_column("is_correct", "previous_is_correct")
    joined = joined.rename_column(
        "output_prediction_text", "previous_output_prediction_text"
    )
    return joined


# extract the plain question text from the ifeval evals dataset
def get_question(example):
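    """Parse the serialized dialog in input_question down to the question body.

    The raw field is a JSON-like literal, parsed here with ast.literal_eval
    after mapping null/true/false to Python literals. Two known casing
    mismatches are patched so the join with the original IFEval data succeeds,
    and input_final_prompts is flattened to its first entry.
    """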
    try:
        example["input_question"] = (
            ast.literal_eval(
                example["input_question"]
                .replace("null", "None")
                .replace("true", "True")
                .replace("false", "False")
            )["dialog"][0]["body"]
            .replace("Is it True that the first song", "Is it true that the first song")
            .replace("Is the following True", "Is the following true")
        )
        example["input_final_prompts"] = example["input_final_prompts"][0]
        return example
    except Exception:
        # surface the offending record instead of silently returning None,
        # which would make Dataset.map fail with a confusing error
        print(example["input_question"])
        raise


# change the yaml files to use the correct model name and work directory
def change_yaml(args, base_name):
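    """Copy the template YAMLs into work_dir, substituting model name and paths.

    Every occurrence of the placeholder model name Llama-3.1-8B is replaced
    with base_name, and WORK_DIR with the destination directory. The task
    lists in meta_pretrain.yaml and meta_instruct.yaml are then rewritten to
    match the 3.1 or 3.2 evals dataset.
    """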
    # os.path.join keeps the pattern correct whether or not template_dir has a
    # trailing slash
    for yaml_file in glob.glob(
        os.path.join(args.template_dir, "**/*/*.yaml"), recursive=True
    ):
        with open(yaml_file, "r") as sources:
            lines = sources.readlines()
        output_path = yaml_file.replace(args.template_dir, args.work_dir)
        print(f"changing {yaml_file} to output_path: {output_path}")
        path = Path(output_path)
        yaml_dir = path.parent
        with open(output_path, "w") as output:
            for line in lines:
                output.write(
                    line.replace("Llama-3.1-8B", base_name).replace(
                        "WORK_DIR", str(yaml_dir)
                    )
                )
    # the 3.2 evals dataset has a different set of tasks from 3.1
    # Update tasks in meta_pretrain.yaml
    with open(args.template_dir + "/meta_pretrain.yaml", "r") as yaml_file:
        meta_pretrain = yaml.safe_load(yaml_file)
    if args.evals_dataset in LLAMA_3_1_PRETRAIN_EVALS:
        meta_pretrain["task"] = ["meta_bbh", "meta_mmlu_pro_pretrain"]
    elif args.evals_dataset in LLAMA_3_2_PRETRAIN_EVALS:
        meta_pretrain["task"] = ["meta_mmlu_pretrain"]
    with open(args.work_dir + "/meta_pretrain.yaml", "w") as yaml_file:
        yaml.dump(meta_pretrain, yaml_file)
    # Update tasks in meta_instruct.yaml
    with open(args.template_dir + "/meta_instruct.yaml", "r") as yaml_file:
        meta_instruct = yaml.safe_load(yaml_file)
    if args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
        meta_instruct["task"] = [
            "meta_ifeval",
            "meta_math_hard",
            "meta_gpqa_cot",
            "meta_mmlu_pro_instruct",
        ]
    elif args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS:
        meta_instruct["task"] = ["meta_mmlu_instruct", "meta_math", "meta_gpqa"]
    with open(args.work_dir + "/meta_instruct.yaml", "w") as yaml_file:
        yaml.dump(meta_instruct, yaml_file)


# copy the template files and change the yaml files to use the correct model name
def copy_and_prepare(args):
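    """Populate work_dir from template_dir and rewrite the YAMLs for the model.

    Also downloads the nltk punkt_tab tokenizer data needed at eval time.
    """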
    # nltk punkt_tab package is needed
    nltk.download("punkt_tab")
    copy_dir(args.template_dir, args.work_dir)
    # Use the template yaml to get the correct model name in work_dir yaml
    base_name = (
        args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "")
    )
    change_yaml(args, base_name)


def parse_eval_args():
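    """Parse the command line; the only flag is the path to the config YAML."""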
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--config_path",
        type=str,
        default="./eval_config.yaml",
        help="the config yaml file that contains all the eval parameters",
    )
    return parser.parse_args()


def prepare_datasets(args):
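    """Build the joined parquet files for tasks that need the original datasets.

    IFEval and MATH(-hard) are joined with their source datasets by question
    text; which joins run depends on the requested tasks and on whether the
    evals dataset belongs to the 3.1 or 3.2 collection.
    """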
    # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join
    # the original dataset with the evals dataset by the actual questions.
    # model_name is derived from the evals_dataset name
    task_list = args.tasks.split(",")
    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
    if "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_1_INSTRUCT_EVALS:
        get_ifeval_data(model_name, args.work_dir)
        get_math_hard_data(model_name, args.work_dir)
    elif (
        "meta_instruct" in task_list and args.evals_dataset in LLAMA_3_2_INSTRUCT_EVALS
    ):
        get_math_data(model_name, args.work_dir)
    else:
        if "meta_ifeval" in task_list:
            get_ifeval_data(model_name, args.work_dir)
        if "meta_math_hard" in task_list:
            get_math_hard_data(model_name, args.work_dir)


# copy the files from src to dst
def copy_dir(src, dst):
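    """Recursively copy src into dst, falling back to a file copy for non-dirs."""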
    try:
        shutil.copytree(src, dst, dirs_exist_ok=True)
    except OSError as exc:
        # src was a single file rather than a directory
        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
            shutil.copy(src, dst)
        else:
            raise


# load the config yaml file
def load_config(config_path: str = "./config.yaml"):
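    """Read the YAML configuration file and return it as a dict."""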
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)
    return config


if __name__ == "__main__":
    args = parse_eval_args()
    config = load_config(args.config_path)
    # merge every config entry into the parsed args
    for k, v in config.items():
        setattr(args, k, v)
    if not os.path.exists(args.template_dir):
        raise ValueError("The template_dir does not exist, please check the path")
    if args.evals_dataset not in (
        LLAMA_3_1_INSTRUCT_EVALS
        + LLAMA_3_1_PRETRAIN_EVALS
        + LLAMA_3_2_INSTRUCT_EVALS
        + LLAMA_3_2_PRETRAIN_EVALS
    ):
        raise ValueError(
            "The evals dataset is not valid, please double check the name; it must be a name from the Llama 3.1 or 3.2 Evals collection."
        )
    # Create the vLLM model_args string for lm_eval
    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
    # Copy all files from the template folder to the work folder
    copy_and_prepare(args)
    # Prepare the datasets for the IFeval and MATH_Hard tasks as we need to join them with the original datasets
    prepare_datasets(args)
    print(
        f"preparation for {args.model_name} using {args.evals_dataset} is done, everything is saved in the work_dir: {args.work_dir}"
    )
    command_str = f"lm_eval --model vllm --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path {args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42"
    if args.limit:
        command_str += f" --limit {args.limit}"
    if args.log_samples:
        command_str += " --log_samples"
    if args.show_config:
        command_str += " --show_config"
    print("please use the following command to run the Meta reproduced evals:")
    print(command_str)
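
# Example invocation (assuming an eval_config.yaml like the sketch near the
# top of this file):
#
#   python prepare_meta_eval.py --config_path ./eval_config.yaml
#
# The script prints the lm_eval command to run afterwards, e.g. (illustrative):
#
#   lm_eval --model vllm \
#       --model_args pretrained=meta-llama/Llama-3.1-8B-Instruct,... \
#       --tasks meta_instruct --batch_size auto --output_path ./eval_results \
#       --include_path /abs/path/to/work_dir --seed 42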