@@ -1,8 +1,9 @@
 import typing as t
 
-from datasets import load_dataset
 import dspy
+from datasets import load_dataset
+
 from .datatypes import TaskDatasets
 from .helpers import train_val_test_split
@@ -12,34 +13,64 @@ def datasets(
     validation_size: float = 0.2,
 ) -> TaskDatasets:
     """
-    TODO:
     Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want)
     """
-    dataset = load_dataset("TODO")
-    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
+    dataset = load_dataset(
+        "meta-llama/Llama-3.3-70B-Instruct-evals",
+        "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
+    )
+    return train_val_test_split(
+        dataset["latest"],
+        _task_doc_example,
+        train_size,
+        validation_size,
+    )
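For context, a minimal sketch of what this loader returns (assuming access to the gated meta-llama repo on the Hugging Face Hub): the config used above exposes a `latest` split whose records carry the fields declared in `TaskDoc` below.

```python
from datasets import load_dataset

# Sketch only: inspect one record of the MMLU-Pro eval-details config.
# Assumes the gated "meta-llama/Llama-3.3-70B-Instruct-evals" repo is
# accessible and that the "latest" split exists, as used in datasets() above.
dataset = load_dataset(
    "meta-llama/Llama-3.3-70B-Instruct-evals",
    "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
)
row = dataset["latest"][0]
print(row["input_question"])        # question text
print(row["input_choice_list"])     # mapping of option letters to option text
print(row["output_parsed_answer"])  # answer letter recorded by the eval run
```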
 
 
 class TaskDoc(t.TypedDict):
-    problem: str
-    gold: str
+    task_type: str
+    task_name: str
+    subtask_name: str
+    input_question: str
+    input_choice_list: dict
+    input_final_prompts: list
+    input_correct_responses: list
+    output_prediction_text: list
+    output_parsed_answer: str
+    output_choice_completions: t.Optional[dict]
+    output_choice_negative_log_likelihoods: t.Optional[dict]
+    output_metrics: dict
+    is_correct: bool
+    input_question_hash: str
+    input_final_prompts_hash: list
+    benchmark_label: str
+    eval_config: dict
 
 
-inputs = ["problem"]
-outputs = ["answer"]
+inputs = ["input_question", "input_choice_list"]
+outputs = ["output_parsed_answer"]
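Only `input_question` and `input_choice_list` feed the model, and `output_parsed_answer` is the label; the remaining `TaskDoc` keys are bookkeeping from the original eval run. A hypothetical record, trimmed to just those three fields (values invented for illustration):

```python
# Hypothetical record trimmed to the fields listed in inputs/outputs;
# values are invented for illustration. Real records carry the full
# TaskDoc schema (prompts, hashes, metrics, ...).
doc = {
    "input_question": "Which gas makes up most of Earth's atmosphere?",
    "input_choice_list": {
        "A": "Oxygen",
        "B": "Nitrogen",
        "C": "Argon",
        "D": "Carbon dioxide",
    },
    "output_parsed_answer": "B",
}
```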
 
 
 def _task_doc_example(doc: TaskDoc) -> dspy.Example:
-    return dspy.Example(
-        problem=doc["problem"],
-        answer=doc["gold"],
-    ).with_inputs(*inputs)
+    # Map the raw eval record onto the fields used by the MMLUPro signature
+    example = dspy.Example(
+        question=doc["input_question"],
+        options=doc["input_choice_list"],
+        answer=doc["output_parsed_answer"],
+    )
+    # Mark the input fields via dspy's public API; the remaining field
+    # ("answer") is treated as the label
+    return example.with_inputs("question", "options")
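Reusing the trimmed `doc` from the sketch above, the conversion and the resulting input/label split can be checked like this. `inputs()` and `labels()` are `dspy.Example` helpers that partition fields by what `with_inputs` marked.

```python
# Sketch: convert one trimmed record and inspect how dspy splits it.
example = _task_doc_example(doc)  # `doc` is the hypothetical record above
print(example.inputs())   # Example with question + options only
print(example.labels())   # Example with answer only
```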
 
 
 def signature(instructions: str = "") -> dspy.Signature:
     class MMLUPro(dspy.Signature):
         __doc__ = instructions
-        problem: str = dspy.InputField()
-        answer: str = dspy.OutputField()
+        # Field names must match those produced by _task_doc_example
+        question: str = dspy.InputField(desc="The question to be answered")
+        options: dict = dspy.InputField(desc="Dictionary of answer choices")
+        answer: str = dspy.OutputField(desc="The correct answer letter")
 
     return MMLUPro
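To see the signature in action, a sketch of wiring it into a plain `dspy.Predict` module (the LM name and instruction string are placeholders, not part of this change):

```python
import dspy

# Placeholder LM; swap in whatever model the project actually uses.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

predict = dspy.Predict(signature("Answer the multiple-choice question."))
result = predict(
    question="Which gas makes up most of Earth's atmosphere?",
    options={"A": "Oxygen", "B": "Nitrogen", "C": "Argon", "D": "Carbon dioxide"},
)
print(result.answer)  # expected to be an option letter such as "B"
```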