
Updated llama-mmlu-pro and added humaneval

Justin Lee, 3 months ago
parent
commit
314b6a874a

+ 77 - 0
end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py

@@ -0,0 +1,77 @@
+import typing as t
+
+from bigcode_eval.tasks import humaneval
+from bigcode_eval.tasks.custom_metrics.execute import check_correctness
+from datasets import load_dataset
+from lm_eval.evaluator_utils import eval_logger
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+if t.TYPE_CHECKING:
+    from bigcode_eval.base import Task
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class HumanEval(dspy.Signature):
+        __doc__ = instructions
+        prompt: str = dspy.InputField()
+        solution: str = dspy.OutputField()
+
+    return HumanEval
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    program = gold.prompt + "\n" + pred.solution + "\n" + gold.dspy_test
+    result = check_correctness(
+        program,
+        timeout=30,
+        task_id=gold.dspy_task_id,
+        completion_id=None,
+    )
+
+    if result["passed"]:
+        return True
+
+    eval_logger.debug(f"{gold.dspy_task_id}: {result['result']}")
+    return False
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    dataset = load_dataset("codeparrot/instructhumaneval")
+    train_docs, validation_docs, test_docs = train_val_test_split(
+        dataset,
+        train_size=train_size,
+        validation_size=validation_size,
+    )
+
+    return TaskDatasets(
+        trainset=map(_task_doc_example, train_docs),
+        valset=map(_task_doc_example, validation_docs),
+        testset=map(_task_doc_example, test_docs),
+    )
+
+
+class TaskDoc(t.TypedDict):
+    task_id: str
+    prompt: str
+    canonical_solution: str
+    test: str
+
+
+inputs = ["prompt"]
+outputs = ["solution"]
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    return dspy.Example(
+        prompt=doc["prompt"],
+        solution=doc["canonical_solution"],
+        # dspy_ keys are hidden
+        dspy_task_id=doc["task_id"],
+        dspy_test=doc["test"],
+    ).with_inputs(*inputs)
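
For context, a minimal sketch of how this new benchmark module might be driven from DSPy's evaluation harness. The import path benchmarks.humaneval, the openai/gpt-4o-mini backend, and the dspy.configure / dspy.LM setup are illustrative assumptions (recent DSPy releases), not part of this commit:

# Hypothetical usage sketch; not part of the commit.
import dspy
from dspy.evaluate import Evaluate

from benchmarks import humaneval  # assumed import path for the new module

# Assumed LM backend; any dspy.LM-compatible model string would work.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

data = humaneval.datasets()  # TaskDatasets with trainset / valset / testset
program = dspy.Predict(humaneval.signature("Complete the function body."))

evaluate = Evaluate(
    devset=list(data.testset),  # testset is a lazy map(), so materialize it
    metric=humaneval.metric,    # executes generated code via check_correctness
    num_threads=4,
    display_progress=True,
)
evaluate(program)

Note that the metric concatenates the prompt, the predicted solution, and the hidden dspy_test field into one program and hands it to bigcode_eval's check_correctness with a 30-second timeout, so evaluation needs an environment where executing generated code is acceptable.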

+ 45 - 14
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -1,8 +1,9 @@
 import typing as t
 
-from datasets import load_dataset
 import dspy
 
+from datasets import load_dataset
+
 from .datatypes import TaskDatasets
 from .helpers import train_val_test_split
 
@@ -12,34 +13,64 @@ def datasets(
     validation_size: float = 0.2,
 ) -> TaskDatasets:
     """
-    TODO:
     Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want)
     """
-    dataset = load_dataset("TODO")
-    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
+    dataset = load_dataset(
+        "meta-llama/Llama-3.3-70B-Instruct-evals",
+        "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
+    )
+    return train_val_test_split(
+        dataset["latest"],
+        _task_doc_example,
+        train_size,
+        validation_size,
+    )
 
 
 class TaskDoc(t.TypedDict):
-    problem: str
-    gold: str
+    task_type: str
+    task_name: str
+    subtask_name: str
+    input_question: str
+    input_choice_list: dict
+    input_final_prompts: list
+    input_correct_responses: list
+    output_prediction_text: list
+    output_parsed_answer: str
+    output_choice_completions: t.Optional[dict]
+    output_choice_negative_log_likelihoods: t.Optional[dict]
+    output_metrics: dict
+    is_correct: bool
+    input_question_hash: str
+    input_final_prompts_hash: list
+    benchmark_label: str
+    eval_config: dict
 
 
-inputs = ["problem"]
-outputs = ["answer"]
+inputs = ["input_question", "input_choice_list"]
+outputs = ["output_parsed_answer"]
 
 
 def _task_doc_example(doc: TaskDoc) -> dspy.Example:
-    return dspy.Example(
-        problem=doc["problem"],
-        answer=doc["gold"],
-    ).with_inputs(*inputs)
+    # Create a new Example with the correct field mapping
+    example = dspy.Example(
+        question=doc["input_question"],
+        options=doc["input_choice_list"],
+        answer=doc["output_parsed_answer"],
+    )
+    # Explicitly set input and output fields
+    example._input_keys = {"question", "options"}
+    example._output_keys = {"answer"}
+    return example
 
 
 def signature(instructions: str = "") -> dspy.Signature:
     class MMLUPro(dspy.Signature):
         __doc__ = instructions
-        problem: str = dspy.InputField()
-        answer: str = dspy.OutputField()
+        # Match the field names with what we're using in _task_doc_example
+        question: str = dspy.InputField(desc="The question to be answered")
+        options: dict = dspy.InputField(desc="Dictionary of answer choices")
+        answer: str = dspy.OutputField(desc="The correct answer letter")
 
     return MMLUPro
 

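Similarly, a hedged sketch of how the reworked MMLU-Pro task could be run end to end, assuming the shared train_val_test_split helper returns a TaskDatasets and using the same illustrative DSPy setup as above. The exact_match metric below is an assumption for illustration, since this module does not ship a metric:

# Hypothetical usage sketch; not part of the commit.
import dspy
from dspy.evaluate import Evaluate

from benchmarks import llama_mmlu_pro  # assumed import path

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # assumed backend

splits = llama_mmlu_pro.datasets(train_size=0.1, validation_size=0.2)
program = dspy.ChainOfThought(
    llama_mmlu_pro.signature("Answer the multiple-choice question.")
)

def exact_match(gold: dspy.Example, pred, trace=None) -> bool:
    # Compare the predicted letter against output_parsed_answer.
    return str(pred.answer).strip() == str(gold.answer).strip()

evaluate = Evaluate(
    devset=list(splits.testset),
    metric=exact_match,
    num_threads=4,
    display_progress=True,
)
evaluate(program)
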
+ 1 - 0
end-to-end-use-cases/prompt-migration/environment.yml

@@ -20,3 +20,4 @@ dependencies:
       - transformers
       - openai
       - databricks-sdk
+      - bigcode_eval