# leaderboard_mmlu_pro.py — MMLU-Pro leaderboard task definition.
  1. import typing as t
  2. from datasets import load_dataset
  3. import dspy
  4. from .datatypes import TaskDatasets
  5. from .helpers import train_val_test_split
  6. def signature(instructions: str = "") -> dspy.Signature:
  7. class MMLUPro(dspy.Signature):
  8. __doc__ = instructions
  9. question: str = dspy.InputField()
  10. options: list[str] = dspy.InputField()
  11. answer: str = dspy.OutputField()
  12. return MMLUPro
  13. def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
  14. return gold.answer == pred.answer
  15. def datasets(
  16. train_size: float = 0.1,
  17. validation_size: float = 0.2,
  18. ) -> TaskDatasets:
  19. dataset = load_dataset("TIGER-Lab/MMLU-Pro")
  20. return train_val_test_split(
  21. dataset["test"], _task_doc_example, train_size, validation_size
  22. )
class TaskDoc(t.TypedDict):
    """Shape of one raw MMLU-Pro record as loaded from the HF dataset."""

    question_id: int
    question: str
    # Raw option texts; letter prefixes are added in _task_doc_example.
    options: list[str]
    # Presumably the gold option letter (e.g. "A") — confirm against dataset.
    answer: str
    answer_index: int
    cot_content: str
    category: str
    src: str
# Field names passed to Example.with_inputs in _task_doc_example.
inputs = ["question", "options"]
# Output field produced by the model; compared by metric().
outputs = ["answer"]
  34. def _num_letter(n: int) -> str:
  35. return chr(ord("A") + n)
  36. def _task_doc_example(doc: TaskDoc) -> dspy.Example:
  37. question = doc["question"]
  38. options = [f"{_num_letter(i)}. {option}" for i, option in enumerate(doc["options"])]
  39. answer = doc["answer"]
  40. return dspy.Example(
  41. question=question,
  42. options=options,
  43. answer=answer,
  44. ).with_inputs(*inputs)