# prepare_dataset.py
# Joins Meta-Llama-3.1 eval-detail datasets (meta-llama/*-evals) with the
# original IFEval / MATH-Hard datasets and writes the joined result to parquet.
import json

from datasets import load_dataset, Dataset
  2. def get_ifeval_data(model_name,output_dir):
  3. if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
  4. raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
  5. original_dataset_name = "wis-k/instruction-following-eval"
  6. #meta_dataset_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-evals"
  7. meta_dataset_name = f"meta-llama/{model_name}-evals"
  8. meta_data = load_dataset(
  9. meta_dataset_name,
  10. name=f"{model_name}-evals__ifeval__strict__details",
  11. split="latest"
  12. )
  13. ifeval_data = load_dataset(
  14. original_dataset_name,
  15. split="train"
  16. )
  17. meta_data = meta_data.map(get_question)
  18. meta_df = meta_data.to_pandas()
  19. ifeval_df = ifeval_data.to_pandas()
  20. ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
  21. print("meta_df",meta_df.columns)
  22. print(meta_df)
  23. print("ifeval_df",ifeval_df.columns)
  24. print(ifeval_df)
  25. joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
  26. joined = joined.rename(columns={"input_final_prompts": "prompt"})
  27. joined = joined.rename(columns={"is_correct": "previous_is_correct"})
  28. joined = Dataset.from_pandas(joined)
  29. joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
  30. joined.rename_column("output_prediction_text","previous_output_prediction_text")
  31. print(joined)
  32. for item in joined:
  33. check_sample(item)
  34. joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
  35. def get_math_data(model_name,output_dir):
  36. if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
  37. raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard")
  38. original_dataset_name = "lighteval/MATH-Hard"
  39. meta_dataset_name = f"meta-llama/{model_name}-evals"
  40. meta_data = load_dataset(
  41. meta_dataset_name,
  42. name=f"{model_name}-evals__math_hard__details",
  43. split="latest"
  44. )
  45. math_data = load_dataset(
  46. original_dataset_name,
  47. split="test"
  48. )
  49. meta_df = meta_data.to_pandas()
  50. math_df = math_data.to_pandas()
  51. math_df = math_df.rename(columns={"problem": "input_question"})
  52. print("meta_df",meta_df.columns)
  53. print(meta_df)
  54. print("math_df",math_df.columns)
  55. print(math_df)
  56. joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
  57. # joined = Dataset.from_pandas(joined)
  58. # joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
  59. # joined = joined.rename_column("is_correct","previous_is_correct")
  60. # joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
  61. print(joined)
  62. # for item in joined:
  63. # check_sample(item)
  64. joined.to_parquet(output_dir + f"/joined_math.parquet")
  65. #joined.save_to_disk(output_dir + f"/joined_math")
  66. def get_question(example):
  67. try:
  68. example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")
  69. example["input_final_prompts"] = example["input_final_prompts"][0]
  70. return example
  71. except:
  72. print(example["input_question"])
  73. return
  74. def check_sample(example):
  75. if "kwargs" in example and not example["kwargs"]:
  76. print(example)
  77. raise ValueError("This example did not got joined for IFeval")
  78. if "solution" in example and not example["solution"]:
  79. print(example)
  80. raise ValueError("This example did not got joined for MATH_hard")