"""DSPy task definition for HumanEval (codeparrot/instructhumaneval):
prompt -> solution signature, execution-based correctness metric, and
train/validation/test dataset splits."""

import typing as t

from bigcode_eval.tasks import humaneval
from bigcode_eval.tasks.custom_metrics.execute import check_correctness
from datasets import load_dataset
from lm_eval.evaluator_utils import eval_logger

import dspy

from .datatypes import TaskDatasets
from .helpers import train_val_test_split

if t.TYPE_CHECKING:
    from bigcode_eval.base import Task

def signature(instructions: str = "") -> dspy.Signature:
    """Build a prompt -> solution Signature carrying the given instructions."""

    class HumanEval(dspy.Signature):
        __doc__ = instructions

        prompt: str = dspy.InputField()
        solution: str = dspy.OutputField()

    return HumanEval

def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
    """Concatenate prompt, predicted solution, and reference tests, then
    execute the program; the example passes only if the tests succeed."""
    program = gold.prompt + "\n" + pred.solution + "\n" + gold.dspy_test
    result = check_correctness(
        program,
        timeout=30,
        task_id=gold.dspy_task_id,
        completion_id=None,
    )
    if result["passed"]:
        return True
    eval_logger.debug(f"{gold.dspy_task_id}: {result['result']}")
    return False

def datasets(
    train_size: float = 0.1,
    validation_size: float = 0.2,
) -> TaskDatasets:
    dataset = load_dataset("codeparrot/instructhumaneval")
    train_docs, validation_docs, test_docs = train_val_test_split(
        dataset,
        train_size=train_size,
        validation_size=validation_size,
    )
    return TaskDatasets(
        trainset=map(_task_doc_example, train_docs),
        valset=map(_task_doc_example, validation_docs),
        testset=map(_task_doc_example, test_docs),
    )

class TaskDoc(t.TypedDict):
    task_id: str
    prompt: str
    canonical_solution: str
    test: str


inputs = ["prompt"]
outputs = ["solution"]

def _task_doc_example(doc: TaskDoc) -> dspy.Example:
    return dspy.Example(
        prompt=doc["prompt"],
        solution=doc["canonical_solution"],
        # dspy_-prefixed keys are hidden from the model and read by the metric
        dspy_task_id=doc["task_id"],
        dspy_test=doc["test"],
    ).with_inputs(*inputs)
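

# --- Usage sketch (not part of the original module; illustrative only) ---
# Minimal example of wiring signature(), datasets(), and metric() together
# with dspy.Evaluate. The LM name and instruction text are assumptions,
# not project choices.
if __name__ == "__main__":
    dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # assumed model, swap freely

    task = datasets()
    predictor = dspy.Predict(signature("Complete the Python function."))

    evaluate = dspy.Evaluate(
        devset=list(task.valset),  # valset is a lazy map of dspy.Example objects
        metric=metric,
        display_progress=True,
    )
    evaluate(predictor)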