Browse source

Set up meta-eval for benchmark, ray error

Justin Lee 2 months ago
parent
commit
dc406b4769
21 changed files with 1384 additions and 15369 deletions
  1. +3 -3  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml
  2. +28 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml
  3. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py
  4. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml
  5. +19 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py
  6. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml
  7. +20 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py
  8. +32 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml
  9. +139 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py
  10. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml
  11. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml
  12. +268 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py
  13. +3 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml
  14. +4 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml
  15. +14 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml
  16. +31 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py
  17. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
  18. +28 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml
  19. +135 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py
  20. +3 -8  end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py
  21. +507 -15358  end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb

+ 3 - 3
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,9 +1,9 @@
-model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.3-70B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
-evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
+evals_dataset: "meta-llama/Llama-3.1-70B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
 # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
-tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+tasks: "meta_mmlu_pro_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
 # Available tasks for 3.2 instruct model: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them
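
As a minimal sketch (not part of the commit), this is how one might sanity-check the updated config before launching the harness; the file name, key names, and values come from the diff above, while the task allow-list simply mirrors the comments for the 3.1 instruct model and is an assumption:

```python
# Hedged sketch: validate eval_config.yaml before running the meta-eval.
import yaml

# Allow-list taken from the comments above (instruct model tasks); assumption only.
INSTRUCT_TASKS = {
    "meta_math_hard",
    "meta_gpqa_cot",
    "meta_mmlu_pro_instruct",
    "meta_ifeval",
    "meta_instruct",
}

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

if cfg["tasks"] not in INSTRUCT_TASKS:
    raise ValueError(f"unexpected task for an instruct evals dataset: {cfg['tasks']}")
if "Instruct-evals" not in cfg["evals_dataset"]:
    raise ValueError("instruct tasks expect an *-Instruct-evals dataset")

print(cfg["model_name"], "->", cfg["tasks"])
```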

+ 28 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__bbh__details
+task: meta_bbh
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'the answer is (.*?)\.'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
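
As a rough standalone illustration (not harness code), the `strict-match` filter above applies the regex and then `take_first`, so the first captured group becomes the answer that `exact_match` scores:

```python
import re

# Made-up BBH chain-of-thought completion for illustration.
completion = "Let's think step by step. ... So the answer is (B).\n\nQ: "

# filter: regex 'the answer is (.*?)\.' followed by take_first
matches = re.findall(r"the answer is (.*?)\.", completion)
answer = matches[0] if matches else ""  # take_first keeps the first capture
print(answer)  # (B)
```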

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py

@@ -0,0 +1,21 @@
+import random
+import re
+
+import datasets
+
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details
+task: meta_gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: ' ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
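
The `group_select: -1` setting above tells the harness to keep the last match of `' ([A-Z])'` rather than the first, which is how the final letter choice is picked out of a longer completion; a standalone approximation:

```python
import re

# Made-up completion; ' ([A-Z])' matches every space-followed capital letter,
# and group_select: -1 keeps the last capture.
completion = "Between options A and C, the correct choice is C"
matches = re.findall(r" ([A-Z])", completion)
answer = matches[-1] if matches else ""  # group_select: -1 -> last match
print(answer)  # C
```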

+ 19 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py

@@ -0,0 +1,19 @@
+import random
+import re
+
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details
+task: meta_gpqa_cot
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 20 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py

@@ -0,0 +1,20 @@
+import random
+import re
+
+import datasets
+
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 32 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml

@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_ifeval.parquet
+output_type: generate_until
+test_split: train
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+  max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_strict_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_loose_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+metadata:
+  version: 2.0
+fewshot_config:
+  sampler: first_n

+ 139 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py

@@ -0,0 +1,139 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+                
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    new_kwargs = []
+    for item in doc["kwargs"]:
+        if item["nth_paragraph"]:
+            item["nth_paragraph"] = int(item["nth_paragraph"])
+        new_kwargs.append(item)
+    inp = InputExample(
+        key=doc["key"],
+        instruction_id_list=doc["instruction_id_list"],
+        prompt=doc["prompt"],
+        kwargs=new_kwargs,
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        "prompt_level_strict_acc": out_strict.follow_all_instructions,
+        "inst_level_strict_acc": out_strict.follow_instruction_list,
+        "prompt_level_loose_acc": out_loose.follow_all_instructions,
+        "inst_level_loose_acc": out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
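
Since `process_results` returns one boolean per instruction for the inst-level metrics, `agg_inst_level_acc` receives a list of lists; a small standalone check of that aggregation (function copied from above):

```python
# Each element is the follow_instruction_list for one prompt; the metric is
# accuracy over individual instructions, not over prompts.
def agg_inst_level_acc(items):
    flat_items = [item for sublist in items for item in sublist]
    return sum(flat_items) / len(flat_items)

per_prompt = [[True, False], [True, True, True]]
print(agg_inst_level_acc(per_prompt))  # 0.8 (4 of 5 instructions followed)
```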

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml

@@ -0,0 +1,21 @@
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_math.parquet
+task: meta_math
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml

@@ -0,0 +1,21 @@
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_math_hard.parquet
+task: meta_math_hard
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 5120
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 268 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py

@@ -0,0 +1,268 @@
+# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py
+import re
+import signal
+from typing import Dict, List, Optional
+
+import datasets
+
+from lm_eval.utils import eval_logger
+
+
+try:
+    import sympy
+    from sympy.parsing.latex import parse_latex
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "`sympy` is required for generating translation task prompt templates. \
+please install sympy via pip install lm-eval[math] or pip install -e .[math]",
+    )
+
+# taken from
+# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": normalize_final_answer(
+                 remove_boxed(last_boxed_only_string(doc["solution"]))
+            ),
+            "meta_target": doc["input_correct_responses"]
+        }
+        return out_doc
+    return dataset.map(_process_doc)
+
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidates = results[0]
+    last_boxed_string = last_boxed_only_string(candidates)
+    if not last_boxed_string:
+        # No boxed string found, so we can't evaluate
+        return {"exact_match": 0}
+    unnormalized_answer = remove_boxed(last_boxed_string)
+    answer = normalize_final_answer(unnormalized_answer)
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
+        retval = 1
+    else:
+        retval = 0
+
+    results = {
+        "exact_match": retval,
+    }
+    return results
+
+
+def last_boxed_only_string(string: str) -> Optional[str]:
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx : right_brace_idx + 1]
+
+    return retval
+
+
+def remove_boxed(s: str) -> str:
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[: len(left)] == left
+        return s[len(left) :]
+
+    left = "\\boxed{"
+
+    assert s[: len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left) : -1]
+
+
+class timeout:
+    def __init__(self, seconds=1, error_message="Timeout"):
+        self.seconds = seconds
+        self.error_message = error_message
+
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
+
+
+def is_equiv(x1: str, x2: str) -> bool:
+    """
+    x1 and x2 are normalized latex string
+    """
+    try:
+        with timeout(seconds=5):
+            try:
+                parsed_x1 = parse_latex(x1)
+                parsed_x2 = parse_latex(x2)
+            except (
+                sympy.parsing.latex.errors.LaTeXParsingError,
+                sympy.SympifyError,
+                TypeError,
+            ):
+                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
+                return False
+
+            try:
+                diff = parsed_x1 - parsed_x2
+            except TypeError:
+                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
+                return False
+
+            try:
+                if sympy.simplify(diff) == 0:
+                    return True
+                else:
+                    return False
+            except ValueError:
+                eval_logger.debug(
+                    f"Had some trouble simplifying when comparing {x1} and {x2}"
+                )
+    except TimeoutError:
+        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
+        return False
+    except ImportError as e:
+        eval_logger.error(e)
+        raise
+    except Exception as e:
+        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
+        return False
+
+
+def get_unnormalized_answer(text: str) -> str:
+    INVALID_ANSWER = "[invalidanswer]"
+    end_seq = "I hope it is correct."
+    text += end_seq
+    match = re.search(
+        r"Final Answer: The final answer is(.*?). I hope it is correct.",
+        text,
+    )
+    if match:
+        return match.group(1).strip()
+    else:
+        return INVALID_ANSWER
+
+
+SUBSTITUTIONS = [
+    ("an ", ""),
+    ("a ", ""),
+    (".$", "$"),
+    ("\\$", ""),
+    (r"\ ", ""),
+    (" ", ""),
+    ("mbox", "text"),
+    (",\\text{and}", ","),
+    ("\\text{and}", ","),
+    ("\\text{m}", "\\text{}"),
+]
+REMOVED_EXPRESSIONS = [
+    "square",
+    "ways",
+    "integers",
+    "dollars",
+    "mph",
+    "inches",
+    "ft",
+    "hours",
+    "km",
+    "units",
+    "\\ldots",
+    "sue",
+    "points",
+    "feet",
+    "minutes",
+    "digits",
+    "cents",
+    "degrees",
+    "cm",
+    "gm",
+    "pounds",
+    "meters",
+    "meals",
+    "edges",
+    "students",
+    "childrentickets",
+    "multiples",
+    "\\text{s}",
+    "\\text{.}",
+    "\\text{\ns}",
+    "\\text{}^2",
+    "\\text{}^3",
+    "\\text{\n}",
+    "\\text{}",
+    r"\mathrm{th}",
+    r"^\circ",
+    r"^{\circ}",
+    r"\;",
+    r",\!",
+    "{,}",
+    '"',
+    "\\dots",
+]
+
+
+def normalize_final_answer(final_answer: str) -> str:
+    """
+    Normalize a final answer to a quantitative reasoning question.
+
+    Copied character for character from appendix D of Lewkowycz et al. (2022)
+    """
+    final_answer = final_answer.split("=")[-1]
+
+    for before, after in SUBSTITUTIONS:
+        final_answer = final_answer.replace(before, after)
+    for expr in REMOVED_EXPRESSIONS:
+        final_answer = final_answer.replace(expr, "")
+
+    # Extract answer that is in LaTeX math, is bold,
+    # is surrounded by a box, etc.
+    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+
+    # Normalize shorthand TeX:
+    #  \fracab -> \frac{a}{b}
+    #  \frac{abc}{bef} -> \frac{abc}{bef}
+    #  \fracabc -> \frac{a}{b}c
+    #  \sqrta -> \sqrt{a}
+    #  \sqrtab -> sqrt{a}b
+    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+    final_answer = final_answer.replace("$", "")
+
+    # Normalize 100,000 -> 100000
+    if final_answer.replace(",", "").isdigit():
+        final_answer = final_answer.replace(",", "")
+
+    return final_answer
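
A quick usage sketch of the extraction path that `process_results` follows, assuming this file is importable as `utils` from the math_hard work_dir and that the sympy/antlr extras required at import time are installed:

```python
# Hedged usage sketch: extract and normalize a final answer.
from utils import last_boxed_only_string, normalize_final_answer, remove_boxed

completion = r"... so the probability is $\boxed{\frac{1}{2}}$."
boxed = last_boxed_only_string(completion)       # -> '\boxed{\frac{1}{2}}'
answer = normalize_final_answer(remove_boxed(boxed))
print(answer)  # \frac{1}{2}
```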

+ 3 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml

@@ -0,0 +1,3 @@
+group: meta_instruct
+task:
+- meta_mmlu_pro_instruct

+ 4 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml

@@ -0,0 +1,4 @@
+group: meta_pretrain
+task:
+- meta_bbh
+- meta_mmlu_pro_pretrain

+ 14 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml

@@ -0,0 +1,14 @@
+task: meta_mmlu
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__mmlu__details
+test_split: latest
+output_type: multiple_choice
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: ["A", "B", "C", "D"]
+# 5-shot prompts are already included in the dataset
+# So no need to generate
+num_fewshot: 0
+metadata:
+  version: 1.0

+ 31 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py

@@ -0,0 +1,31 @@
+import string
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    # Strip out the last two characters, which is a space and the answer
+    # E.g., "Answer: B" -> "Answer:"
+    return doc["input_final_prompts"][0][:-2]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        # input_correct_responses is in format of: "Answer: B"
+        answer = doc["input_correct_responses"][0]
+        # Indexes are always A: 0, B: 1, C: 2, D: 3
+        answer_index = string.ascii_uppercase.index(answer[-1])
+
+        out_doc = {
+            "problem": doc["input_question"],
+            # The answer is the index of the correct response (0-indexed)
+            "gold": answer_index,
+        }
+        return out_doc
+
+    dataset = dataset.select_columns(
+        ["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
+         "input_choice_list"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
+
+def doc_to_target(doc: dict) -> str:
+    return doc["gold"]

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -0,0 +1,29 @@
+task: meta_mmlu_pro_instruct
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 28 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -0,0 +1,28 @@
+task: meta_mmlu_pro_pretrain
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'answer is \(([A-Z])\)'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 135 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py

@@ -0,0 +1,135 @@
+import string
+
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    doc = [
+        {
+            "question": "Explain what difficulties would arise if messenger RNA molecules were not destroyed after they had produced some polypeptide chains.",
+            "options": {
+                "A": "mRNA would replicate rapidly",
+                "B": "The cell would use mRNA as a source of energy",
+                "C": "The cell would lack proteins",
+                "D": "Cell would enter a state of permanent division",
+                "E": "mRNA would be transformed into DNA",
+                "F": "Excess protein production, energy depletion, and potential harm to the cell",
+                "G": "mRNA would exit the cell and infect neighboring cells",
+                "H": "Proteins would be broken down into mRNA",
+                "I": "mRNA would become part of the cell membrane",
+                "J": "mRNA would bind to lipids and carbohydrates, disrupting cellular metabolism",
+            },
+            "answer": "F",
+        },
+        {
+            "question": "Based on the characteristic population curves that result from plotting population growth of a species, the most effective means of controlling the mosquito population is to",
+            "options": {
+                "A": "opt for zero population control once the K value of the curve has been reached",
+                "B": "maintain the population at the highest point of its logistic curve",
+                "C": "reduce the carrying capacity cif the environment to lower the K value",
+                "D": "decrease the mortality rate",
+                "E": "increase the birth rate of the species",
+                "F": "drastically reduce the population below the K value",
+                "G": "maintain the population at a point corresponding to the midpoint of its logistic curve",
+                "H": "increase the carrying capacity of the environment to raise the K value",
+                "I": "opt for zero population control at the beginning of the logistic curve",
+                "J": "null",
+            },
+            "answer": "C",
+        },
+        {
+            "question": "Solve the equation 1.2 = 0.4y using mental math.",
+            "options": {
+                "A": "3",
+                "B": "\u22123",
+                "C": "0.8",
+                "D": "2",
+                "E": "0.3",
+                "F": "5",
+                "G": "\u22124",
+                "H": "4",
+                "I": "6",
+                "J": "1.6",
+            },
+            "answer": "A",
+        },
+        {
+            "question": "assume you are Indonesian. In 2010, the rupiah exchange rate was around IDR15,000/USD, and the consumer price index in Indonesia and the United States was at 100. In 2019, the exchange rate changed to IDR14,000/USD. Simultaneously, Indonesia\u2019s inflation rose 5% due to the consumer price index rising to 105. Meanwhile, the United States\u2019 inflation rate rose 10% due to the consumer price index rising to 110. Whats the real exchange rate?",
+            "options": {
+                "A": "14000.00",
+                "B": "15500.00",
+                "C": "15000.00",
+                "D": "16000.00",
+                "E": "13500.00",
+                "F": "14666.67",
+                "G": "13888.89",
+                "H": "14800.00",
+                "I": "15200.00",
+                "J": "13333.33",
+            },
+            "answer": "F",
+        },
+        {
+            "question": "To move the economy closer to full employment the central bank decides that the federal funds rate must be increased. The appropriate open market operation is to ______ which ______ the money supply ______ aggregate demand and fights ______. OMO \u00a0\u00a0\u00a0 MONEY SUPPLY \u00a0\u00a0\u00a0 AD \u00a0\u00a0\u00a0 TO FIGHT",
+            "options": {
+                "A": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "B": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "C": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "D": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation",
+                "E": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Unemployment",
+                "F": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "G": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+                "H": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation",
+                "I": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+                "J": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+            },
+            "answer": "H",
+        },
+    ]
+    formatted_text = ""
+    for question in doc:
+        # Format user message with question and options
+        user_msg = "<|start_header_id|>user<|end_header_id|>\n\n"
+        user_msg += "Given the following question and candidate answers, choose the best answer.\n"
+        user_msg += f"Question: {question['question']}\n"
+
+        # Add options
+        for option_key, option_value in question["options"].items():
+            if option_value is not None:
+                user_msg += f"{option_key}. {option_value}\n"
+
+        user_msg += '\nYour response should end with "The best answer is [the_answer_letter]." where the [the_answer_letter] is a letter from the provided choices.\n\n'
+        user_msg += "Let's think step by step.<|eot_id|>"
+
+        # Add assistant placeholder message
+        assistant_msg = "<|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>"
+
+        formatted_text += user_msg + assistant_msg
+
+    return formatted_text
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+
+    dataset = dataset.select(range(1200, len(dataset)))
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
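
Note that `doc_to_text` here ignores the incoming doc and always returns the same hand-written 5-shot CoT prefix; a quick standalone check, assuming it is run from this mmlu_pro work_dir so the module imports as `utils`:

```python
# Hedged check: the hard-coded 5-shot prefix is independent of the doc.
from utils import doc_to_text

prefix = doc_to_text({})  # the doc argument is immediately overwritten
assert prefix == doc_to_text({"anything": "else"})
# one user turn per few-shot example
assert prefix.count("<|start_header_id|>user<|end_header_id|>") == 5
```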

+ 3 - 8
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -5,7 +5,7 @@ import dspy
 from datasets import load_dataset
 
 from .datatypes import TaskDatasets
-from .helpers import train_val_test_split
+from .helpers import fixed_split, train_val_test_split
 
 
 def datasets(
@@ -19,12 +19,7 @@ def datasets(
         "meta-llama/Llama-3.3-70B-Instruct-evals",
         "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
     )
-    return train_val_test_split(
-        dataset["latest"],
-        _task_doc_example,
-        train_size,
-        validation_size,
-    )
+    return fixed_split(dataset["latest"], _task_doc_example)
 
 
 class TaskDoc(t.TypedDict):
@@ -58,7 +53,7 @@ def _task_doc_example(doc: TaskDoc) -> dspy.Example:
         answer=doc["output_parsed_answer"],
     )
     example._input_keys = {"question", "options"}
-    example._output_keys = {"answer"} 
+    example._output_keys = {"answer"}
     return example
 
 

The file diff has been suppressed because it is too large
+ 507 - 15358
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb