
added updated llama-mmlu-pro and added humaneval

Justin Lee 3 months ago
parent commit 314b6a874a

+ 77 - 0
end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py

@@ -0,0 +1,77 @@
+import typing as t
+
+from bigcode_eval.tasks import humaneval
+from bigcode_eval.tasks.custom_metrics.execute import check_correctness
+from datasets import load_dataset
+from lm_eval.evaluator_utils import eval_logger
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+if t.TYPE_CHECKING:
+    from bigcode_eval.base import Task
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class HumanEval(dspy.Signature):
+        __doc__ = instructions
+        prompt: str = dspy.InputField()
+        solution: str = dspy.OutputField()
+
+    return HumanEval
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    program = gold.prompt + "\n" + pred.solution + "\n" + gold.dspy_test
+    result = check_correctness(
+        program,
+        timeout=30,
+        task_id=gold.dspy_task_id,
+        completion_id=None,
+    )
+
+    if result["passed"]:
+        return True
+
+    eval_logger.debug(f"{gold.dspy_task_id}: {result['result']}")
+    return False
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    dataset = load_dataset("codeparrot/instructhumaneval")
+    train_docs, validation_docs, test_docs = train_val_test_split(
+        dataset,
+        train_size=train_size,
+        validation_size=validation_size,
+    )
+
+    return TaskDatasets(
+        trainset=map(_task_doc_example, train_docs),
+        valset=map(_task_doc_example, validation_docs),
+        testset=map(_task_doc_example, test_docs),
+    )
+
+
+class TaskDoc(t.TypedDict):
+    task_id: str
+    prompt: str
+    canonical_solution: str
+    test: str
+
+
+inputs = ["prompt"]
+outputs = ["solution"]
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    return dspy.Example(
+        prompt=doc["prompt"],
+        solution=doc["canonical_solution"],
+        # dspy_ keys are hidden
+        dspy_task_id=doc["task_id"],
+        dspy_test=doc["test"],
+    ).with_inputs(*inputs)
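
A minimal usage sketch (not part of the commit) for wiring these helpers into a dspy evaluation loop. The module path, model name, and thread count below are assumptions, and it relies on TaskDatasets exposing the trainset/valset/testset iterables constructed above.

import dspy

from benchmarks import humaneval  # assumes running from the prompt-migration directory

# Placeholder LM; swap in whatever backend the project is configured for.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# Build a predictor from the HumanEval signature and load the splits.
program = dspy.Predict(humaneval.signature("Complete the function described by the prompt."))
data = humaneval.datasets()

# Evaluate on the test split; the metric executes each candidate solution
# against the reference tests via check_correctness.
evaluate = dspy.Evaluate(
    devset=list(data.testset),
    metric=humaneval.metric,
    num_threads=4,
    display_progress=True,
)
evaluate(program)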

+ 45 - 14
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -1,8 +1,9 @@
 import typing as t
 
-from datasets import load_dataset
 import dspy
 
+from datasets import load_dataset
+
 from .datatypes import TaskDatasets
 from .helpers import train_val_test_split
 
@@ -12,34 +13,64 @@ def datasets(
     validation_size: float = 0.2,
 ) -> TaskDatasets:
     """
-    TODO:
     Load the dataset; it should be of type datasets.Dataset (NOT a DatasetDict), or split the dataset yourself however you want.
     """
-    dataset = load_dataset("TODO")
-    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
+    dataset = load_dataset(
+        "meta-llama/Llama-3.3-70B-Instruct-evals",
+        "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
+    )
+    return train_val_test_split(
+        dataset["latest"],
+        _task_doc_example,
+        train_size,
+        validation_size,
+    )
 
 
 class TaskDoc(t.TypedDict):
-    problem: str
-    gold: str
+    task_type: str
+    task_name: str
+    subtask_name: str
+    input_question: str
+    input_choice_list: dict
+    input_final_prompts: list
+    input_correct_responses: list
+    output_prediction_text: list
+    output_parsed_answer: str
+    output_choice_completions: t.Optional[dict]
+    output_choice_negative_log_likelihoods: t.Optional[dict]
+    output_metrics: dict
+    is_correct: bool
+    input_question_hash: str
+    input_final_prompts_hash: list
+    benchmark_label: str
+    eval_config: dict
 
 
-inputs = ["problem"]
-outputs = ["answer"]
+inputs = ["input_question", "input_choice_list"]
+outputs = ["output_parsed_answer"]
 
 
 def _task_doc_example(doc: TaskDoc) -> dspy.Example:
-    return dspy.Example(
-        problem=doc["problem"],
-        answer=doc["gold"],
-    ).with_inputs(*inputs)
+    # Create a new Example with the correct field mapping
+    example = dspy.Example(
+        question=doc["input_question"],
+        options=doc["input_choice_list"],
+        answer=doc["output_parsed_answer"],
+    )
+    # Explicitly set input and output fields
+    example._input_keys = {"question", "options"}
+    example._output_keys = {"answer"}
+    return example
 
 
 def signature(instructions: str = "") -> dspy.Signature:
     class MMLUPro(dspy.Signature):
         __doc__ = instructions
-        problem: str = dspy.InputField()
-        answer: str = dspy.OutputField()
+        # Match the field names with what we're using in _task_doc_example
+        question: str = dspy.InputField(desc="The question to be answered")
+        options: dict = dspy.InputField(desc="Dictionary of answer choices")
+        answer: str = dspy.OutputField(desc="The correct answer letter")
 
     return MMLUPro
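
A minimal usage sketch (not part of the commit) for the MMLU-Pro pieces above. The module path and model name are placeholders, and it assumes train_val_test_split returns a TaskDatasets with trainset/valset/testset fields, as the return annotation suggests.

import dspy

from benchmarks import llama_mmlu_pro  # assumes running from the prompt-migration directory

# Placeholder LM; swap in the project's configured backend.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

data = llama_mmlu_pro.datasets()
predict = dspy.ChainOfThought(
    llama_mmlu_pro.signature("Answer the multiple-choice question with the correct letter.")
)

# Run a single validation example and compare against the parsed gold answer.
example = next(iter(data.valset))
prediction = predict(question=example.question, options=example.options)
print(prediction.answer, "| gold:", example.answer)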
 

+ 1 - 0
end-to-end-use-cases/prompt-migration/environment.yml

@@ -20,3 +20,4 @@ dependencies:
       - transformers
       - openai
       - databricks-sdk
+      - bigcode_eval