
update gitignore, added mmlu 0shot, and ran a bunch of tests

Justin Lee, 2 months ago
commit e1d64ca2f4

+ 4 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/.gitignore

@@ -0,0 +1,4 @@
+
+
+*.json
+*.jsonl

File diff suppressed because it is too large
+ 1 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py


File diff suppressed because it is too large
+ 30 - 25
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py


+ 3 - 1
end-to-end-use-cases/prompt-migration/.gitignore

@@ -1 +1,3 @@
-/mmlu_pro_data
+/mmlu_pro_data
+
+*.csv

+ 9 - 6
end-to-end-use-cases/prompt-migration/benchmarks/download_mmlu_pro.py

@@ -1,26 +1,29 @@
-from datasets import load_dataset
-import pandas as pd
 import os
 
+import pandas as pd
+from datasets import load_dataset
+
+
 def download_mmlu_pro():
     # Create output directory if it doesn't exist
     output_dir = "mmlu_pro_data"
     os.makedirs(output_dir, exist_ok=True)
-    
+
     # Load the dataset
     dataset = load_dataset("TIGER-Lab/MMLU-Pro")
-    
+
     # Convert each split to CSV
     for split in dataset.keys():
         # Convert to pandas DataFrame
         df = pd.DataFrame(dataset[split])
-        
+
         # Save to CSV
         output_path = os.path.join(output_dir, f"mmlu_pro_{split}.csv")
         df.to_csv(output_path, index=False)
         print(f"Saved {split} split to {output_path}")
         print(f"Number of examples in {split}: {len(df)}")
-        
+
+
 if __name__ == "__main__":
     print("Downloading MMLU-Pro dataset...")
     download_mmlu_pro()
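
After this change the script can be run directly and then sanity-checked from Python. A minimal sketch, assuming MMLU-Pro exposes a test split; the mmlu_pro_data directory and mmlu_pro_{split}.csv filenames come from the script above, everything else is illustrative:

    # Run the downloader first:
    #   python end-to-end-use-cases/prompt-migration/benchmarks/download_mmlu_pro.py
    import pandas as pd

    df = pd.read_csv("mmlu_pro_data/mmlu_pro_test.csv")
    print(df.columns.tolist())  # column names depend on the upstream dataset schema
    print(f"{len(df)} rows")

The /mmlu_pro_data entry in the updated .gitignore above keeps this generated output directory out of version control.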

+ 5 - 5
end-to-end-use-cases/prompt-migration/benchmarks/helpers.py

@@ -3,15 +3,15 @@ import typing as t
 from .datatypes import TaskDatasets
 
 if t.TYPE_CHECKING:
-    from datasets import Dataset
     import dspy
+    from datasets import Dataset
 
 
 def train_val_test_split(
     dataset: "Dataset",
     mapper: t.Callable[[dict], "dspy.Example"],
     train_size: float = 0.1,
-    validation_size: float = 0.2,
+    validation_size: float = 0.1,
 ) -> TaskDatasets:
     docs = dataset.train_test_split(train_size=train_size)
     train_docs = docs["train"]
@@ -32,20 +32,20 @@ def fixed_split(
     validation_size: int = 200,
 ) -> TaskDatasets:
     """Split dataset by taking first N examples instead of random sampling.
-    
+
     Args:
         dataset: Input dataset
         mapper: Function to map dataset examples to dspy.Example
         train_size: Number of examples to use for training (default: 1000)
         validation_size: Number of examples to use for validation (default: 200)
-    
+
     Returns:
         TaskDatasets containing train, validation and test splits
     """
     train_docs = dataset.select(range(train_size))
     validation_docs = dataset.select(range(train_size, train_size + validation_size))
     test_docs = dataset.select(range(train_size + validation_size, len(dataset)))
-    
+
     return TaskDatasets(
         trainset=list(map(mapper, train_docs)),
         valset=list(map(mapper, validation_docs)),
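
For context, here is roughly how fixed_split is meant to be called. This is only a sketch: the mapper, the dataset, and the benchmarks.helpers import path are assumptions; only fixed_split's signature and the trainset/valset fields of TaskDatasets are taken from the diff above.

    import dspy
    from datasets import load_dataset

    from benchmarks.helpers import fixed_split  # import path assumed from this repo layout


    def mapper(doc: dict) -> dspy.Example:
        # Illustrative mapper; adapt the field names to the actual dataset schema.
        return dspy.Example(question=doc["question"], answer=doc["answer"]).with_inputs("question")


    dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
    splits = fixed_split(dataset, mapper, train_size=1000, validation_size=200)
    print(len(splits.trainset), len(splits.valset))

Because the split is positional rather than random, repeated runs see the same train and validation examples, which makes prompt-optimization results easier to compare across runs.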

+ 76 - 0
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu.py

@@ -0,0 +1,76 @@
+import typing as t
+
+import dspy
+
+from datasets import load_dataset
+
+from .datatypes import TaskDatasets
+from .helpers import fixed_split, train_val_test_split
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.1,
+) -> TaskDatasets:
+    """
+    Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want)
+    """
+    dataset = load_dataset(
+        "meta-llama/Llama-3.3-70B-Instruct-evals",
+        "Llama-3.3-70B-Instruct-evals__mmlu__0_shot__cot__details",
+    )
+    return train_val_test_split(dataset["latest"], _task_doc_example)
+
+
+class TaskDoc(t.TypedDict):
+    task_type: str
+    task_name: str
+    subtask_name: str
+    input_question: str
+    input_choice_list: dict
+    input_final_prompts: list
+    input_correct_responses: list
+    output_prediction_text: list
+    output_parsed_answer: str
+    output_choice_completions: t.Optional[int]
+    output_choice_negative_log_likelihoods: t.Optional[int]
+    output_metrics: dict
+    is_correct: bool
+    input_question_hash: str
+    input_final_prompts_hash: list
+    benchmark_label: str
+    eval_config: dict
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    # Get reasoning from output_prediction_text if available
+    # reasoning = (
+    #     doc["output_prediction_text"][0] if doc.get("output_prediction_text") else ""
+    # )
+
+    example = dspy.Example(
+        question=doc["input_question"],
+        options=doc["input_choice_list"],
+        answer=doc["output_parsed_answer"],
+        # reasoning=reasoning,
+    )
+    example._input_keys = {"question", "options"}
+    example._output_keys = {"answer"}
+    return example
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class MMLU(dspy.Signature):
+        __doc__ = instructions
+        question: str = dspy.InputField(desc="The question to be answered")
+        options: dict = dspy.InputField(desc="Dictionary of answer choices")
+        # reasoning: str = dspy.OutputField(
+        #     desc="Step-by-step reasoning for arriving at the answer"
+        # )
+        answer: str = dspy.OutputField(desc="The correct answer letter")
+
+    return MMLU
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    return gold.answer == pred.answer  # Keep focusing on answer accuracy
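
A rough sketch of how this new module plugs into a dspy program. The LM name and the benchmarks import path are placeholders; datasets(), signature(), and metric() are the functions added in this file, and valset comes from the TaskDatasets returned by train_val_test_split.

    import dspy

    from benchmarks import llama_mmlu  # import path assumed from this repo layout

    # Any dspy-supported LM works; the model name here is a placeholder.
    dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

    splits = llama_mmlu.datasets()
    program = dspy.Predict(llama_mmlu.signature("Answer the multiple-choice question."))

    correct = 0
    for example in splits.valset:
        prediction = program(question=example.question, options=example.options)
        correct += llama_mmlu.metric(example, prediction)

    print(f"validation accuracy: {correct / len(splits.valset):.3f}")

metric is an exact string match on the parsed answer letter, so predictions that wrap the letter in extra text count as wrong; the commented-out reasoning field suggests a chain-of-thought output may be reintroduced later.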

File diff suppressed because it is too large
+ 3193 - 272
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb