Browse source

Set up meta-eval for benchmark, ray error

Justin Lee 2 months ago
parent
commit
dc406b4769
21 changed files with 1384 additions and 15369 deletions
  1. +3 -3  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml
  2. +28 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml
  3. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py
  4. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml
  5. +19 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py
  6. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml
  7. +20 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py
  8. +32 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml
  9. +139 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py
  10. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml
  11. +21 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml
  12. +268 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py
  13. +3 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml
  14. +4 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml
  15. +14 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml
  16. +31 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py
  17. +29 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
  18. +28 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml
  19. +135 -0  end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py
  20. +3 -8  end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py
  21. +507 -15358  end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb

+ 3 - 3
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml

@@ -1,9 +1,9 @@
-model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.3-70B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
-evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
+evals_dataset: "meta-llama/Llama-3.1-70B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection.
 # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"]
 
-tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+tasks: "meta_mmlu_pro_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
 # Available tasks for 3.2 instruct model: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
 # Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them
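
As a minimal sketch (not part of the commit), this is how one might sanity-check the updated config before launching the harness; the file name, key names, and values come from the diff above, while the task allow-list simply mirrors the comments for the 3.1 instruct model and is an assumption:

```python
# Hedged sketch: validate eval_config.yaml before running the meta-eval.
import yaml

# Allow-list taken from the comments above (instruct model tasks); assumption only.
INSTRUCT_TASKS = {
    "meta_math_hard",
    "meta_gpqa_cot",
    "meta_mmlu_pro_instruct",
    "meta_ifeval",
    "meta_instruct",
}

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

if cfg["tasks"] not in INSTRUCT_TASKS:
    raise ValueError(f"unexpected task for an instruct evals dataset: {cfg['tasks']}")
if "Instruct-evals" not in cfg["evals_dataset"]:
    raise ValueError("instruct tasks expect an *-Instruct-evals dataset")

print(cfg["model_name"], "->", cfg["tasks"])
```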

+ 28 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml

@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__bbh__details
+task: meta_bbh
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'the answer is (.*?)\.'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
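
As a rough standalone illustration (not harness code), the `strict-match` filter above applies the regex and then `take_first`, so the first captured group becomes the answer that `exact_match` scores:

```python
import re

# Made-up BBH chain-of-thought completion for illustration.
completion = "Let's think step by step. ... So the answer is (B).\n\nQ: "

# filter: regex 'the answer is (.*?)\.' followed by take_first
matches = re.findall(r"the answer is (.*?)\.", completion)
answer = matches[0] if matches else ""  # take_first keeps the first capture
print(answer)  # (B)
```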

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py

@@ -0,0 +1,21 @@
+import random
+import re
+
+import datasets
+
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details
+task: meta_gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: ' ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
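
The `group_select: -1` setting above tells the harness to keep the last match of `' ([A-Z])'` rather than the first, which is how the final letter choice is picked out of a longer completion; a standalone approximation:

```python
import re

# Made-up completion; ' ([A-Z])' matches every space-followed capital letter,
# and group_select: -1 keeps the last capture.
completion = "Between options A and C, the correct choice is C"
matches = re.findall(r" ([A-Z])", completion)
answer = matches[-1] if matches else ""  # group_select: -1 -> last match
print(answer)  # C
```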

+ 19 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py

@@ -0,0 +1,19 @@
+import random
+import re
+
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml

@@ -0,0 +1,29 @@
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details
+task: meta_gpqa_cot
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 2048
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 20 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py

@@ -0,0 +1,20 @@
+import random
+import re
+
+import datasets
+
+
+
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+    dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
+    dataset = dataset.rename_column("is_correct","previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)

+ 32 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml

@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_ifeval.parquet
+output_type: generate_until
+test_split: train
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+  max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_strict_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: inst_level_loose_acc
+    aggregation: !function utils.agg_inst_level_acc
+    higher_is_better: true
+metadata:
+  version: 2.0
+fewshot_config:
+  sampler: first_n

+ 139 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py

@@ -0,0 +1,139 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+                
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    new_kwargs = []
+    for item in doc["kwargs"]:
+        if item["nth_paragraph"]:
+            item["nth_paragraph"] = int(item["nth_paragraph"])
+        new_kwargs.append(item)
+    inp = InputExample(
+        key=doc["key"],
+        instruction_id_list=doc["instruction_id_list"],
+        prompt=doc["prompt"],
+        kwargs=new_kwargs,
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        "prompt_level_strict_acc": out_strict.follow_all_instructions,
+        "inst_level_strict_acc": out_strict.follow_instruction_list,
+        "prompt_level_loose_acc": out_loose.follow_all_instructions,
+        "inst_level_loose_acc": out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
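
Since `process_results` returns one boolean per instruction for the inst-level metrics, `agg_inst_level_acc` receives a list of lists; a small standalone check of that aggregation (function copied from above):

```python
# Each element is the follow_instruction_list for one prompt; the metric is
# accuracy over individual instructions, not over prompts.
def agg_inst_level_acc(items):
    flat_items = [item for sublist in items for item in sublist]
    return sum(flat_items) / len(flat_items)

per_prompt = [[True, False], [True, True, True]]
print(agg_inst_level_acc(per_prompt))  # 0.8 (4 of 5 instructions followed)
```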

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml

@@ -0,0 +1,21 @@
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_math.parquet
+task: meta_math
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 21 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml

@@ -0,0 +1,21 @@
+dataset_path: parquet
+dataset_kwargs:
+  data_files: ./work_dir/joined_math_hard.parquet
+task: meta_math_hard
+process_docs: !function utils.process_docs
+output_type: generate_until
+test_split: train
+doc_to_text:  !function utils.doc_to_text
+process_results: !function utils.process_results
+doc_to_target: answer
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 5120
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0

+ 268 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py

@@ -0,0 +1,268 @@
+# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py
+import re
+import signal
+from typing import Dict, List, Optional
+
+import datasets
+
+from lm_eval.utils import eval_logger
+
+
+try:
+    import sympy
+    from sympy.parsing.latex import parse_latex
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "`sympy` is required for generating translation task prompt templates. \
+please install sympy via pip install lm-eval[math] or pip install -e .[math]",
+    )
+
+# taken from
+# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
+def doc_to_text(doc: dict) -> str:
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "answer": normalize_final_answer(
+                 remove_boxed(last_boxed_only_string(doc["solution"]))
+            ),
+            "meta_target": doc["input_correct_responses"]
+        }
+        return out_doc
+    return dataset.map(_process_doc)
+
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    candidates = results[0]
+    last_boxed_string = last_boxed_only_string(candidates)
+    if not last_boxed_string:
+        # No boxed string found, so we can't evaluate
+        return {"exact_match": 0}
+    unnormalized_answer = remove_boxed(last_boxed_string)
+    answer = normalize_final_answer(unnormalized_answer)
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
+        retval = 1
+    else:
+        retval = 0
+
+    results = {
+        "exact_match": retval,
+    }
+    return results
+
+
+def last_boxed_only_string(string: str) -> Optional[str]:
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx : right_brace_idx + 1]
+
+    return retval
+
+
+def remove_boxed(s: str) -> str:
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[: len(left)] == left
+        return s[len(left) :]
+
+    left = "\\boxed{"
+
+    assert s[: len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left) : -1]
+
+
+class timeout:
+    def __init__(self, seconds=1, error_message="Timeout"):
+        self.seconds = seconds
+        self.error_message = error_message
+
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
+
+
+def is_equiv(x1: str, x2: str) -> bool:
+    """
+    x1 and x2 are normalized latex string
+    """
+    try:
+        with timeout(seconds=5):
+            try:
+                parsed_x1 = parse_latex(x1)
+                parsed_x2 = parse_latex(x2)
+            except (
+                sympy.parsing.latex.errors.LaTeXParsingError,
+                sympy.SympifyError,
+                TypeError,
+            ):
+                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
+                return False
+
+            try:
+                diff = parsed_x1 - parsed_x2
+            except TypeError:
+                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
+                return False
+
+            try:
+                if sympy.simplify(diff) == 0:
+                    return True
+                else:
+                    return False
+            except ValueError:
+                eval_logger.debug(
+                    f"Had some trouble simplifying when comparing {x1} and {x2}"
+                )
+    except TimeoutError:
+        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
+        return False
+    except ImportError as e:
+        eval_logger.error(e)
+        raise
+    except Exception as e:
+        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
+        return False
+
+
+def get_unnormalized_answer(text: str) -> str:
+    INVALID_ANSWER = "[invalidanswer]"
+    end_seq = "I hope it is correct."
+    text += end_seq
+    match = re.search(
+        r"Final Answer: The final answer is(.*?). I hope it is correct.",
+        text,
+    )
+    if match:
+        return match.group(1).strip()
+    else:
+        return INVALID_ANSWER
+
+
+SUBSTITUTIONS = [
+    ("an ", ""),
+    ("a ", ""),
+    (".$", "$"),
+    ("\\$", ""),
+    (r"\ ", ""),
+    (" ", ""),
+    ("mbox", "text"),
+    (",\\text{and}", ","),
+    ("\\text{and}", ","),
+    ("\\text{m}", "\\text{}"),
+]
+REMOVED_EXPRESSIONS = [
+    "square",
+    "ways",
+    "integers",
+    "dollars",
+    "mph",
+    "inches",
+    "ft",
+    "hours",
+    "km",
+    "units",
+    "\\ldots",
+    "sue",
+    "points",
+    "feet",
+    "minutes",
+    "digits",
+    "cents",
+    "degrees",
+    "cm",
+    "gm",
+    "pounds",
+    "meters",
+    "meals",
+    "edges",
+    "students",
+    "childrentickets",
+    "multiples",
+    "\\text{s}",
+    "\\text{.}",
+    "\\text{\ns}",
+    "\\text{}^2",
+    "\\text{}^3",
+    "\\text{\n}",
+    "\\text{}",
+    r"\mathrm{th}",
+    r"^\circ",
+    r"^{\circ}",
+    r"\;",
+    r",\!",
+    "{,}",
+    '"',
+    "\\dots",
+]
+
+
+def normalize_final_answer(final_answer: str) -> str:
+    """
+    Normalize a final answer to a quantitative reasoning question.
+
+    Copied character for character from appendix D of Lewkowycz et al. (2022)
+    """
+    final_answer = final_answer.split("=")[-1]
+
+    for before, after in SUBSTITUTIONS:
+        final_answer = final_answer.replace(before, after)
+    for expr in REMOVED_EXPRESSIONS:
+        final_answer = final_answer.replace(expr, "")
+
+    # Extract answer that is in LaTeX math, is bold,
+    # is surrounded by a box, etc.
+    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+
+    # Normalize shorthand TeX:
+    #  \fracab -> \frac{a}{b}
+    #  \frac{abc}{bef} -> \frac{abc}{bef}
+    #  \fracabc -> \frac{a}{b}c
+    #  \sqrta -> \sqrt{a}
+    #  \sqrtab -> sqrt{a}b
+    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+    final_answer = final_answer.replace("$", "")
+
+    # Normalize 100,000 -> 100000
+    if final_answer.replace(",", "").isdigit():
+        final_answer = final_answer.replace(",", "")
+
+    return final_answer
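
A quick usage sketch of the extraction path that `process_results` follows, assuming this file is importable as `utils` from the math_hard work_dir and that the sympy/antlr extras required at import time are installed:

```python
# Hedged usage sketch: extract and normalize a final answer.
from utils import last_boxed_only_string, normalize_final_answer, remove_boxed

completion = r"... so the probability is $\boxed{\frac{1}{2}}$."
boxed = last_boxed_only_string(completion)       # -> '\boxed{\frac{1}{2}}'
answer = normalize_final_answer(remove_boxed(boxed))
print(answer)  # \frac{1}{2}
```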

+ 3 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml

@@ -0,0 +1,3 @@
+group: meta_instruct
+task:
+- meta_mmlu_pro_instruct

+ 4 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml

@@ -0,0 +1,4 @@
+group: meta_pretrain
+task:
+- meta_bbh
+- meta_mmlu_pro_pretrain

+ 14 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml

@@ -0,0 +1,14 @@
+task: meta_mmlu
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__mmlu__details
+test_split: latest
+output_type: multiple_choice
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: ["A", "B", "C", "D"]
+# 5-shot prompts are already included in the dataset
+# So no need to generate
+num_fewshot: 0
+metadata:
+  version: 1.0

+ 31 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py

@@ -0,0 +1,31 @@
+import string
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    # Strip out the last two characters, which is a space and the answer
+    # E.g., "Answer: B" -> "Answer:"
+    return doc["input_final_prompts"][0][:-2]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        # input_correct_responses is in format of: "Answer: B"
+        answer = doc["input_correct_responses"][0]
+        # Indexes are always A: 0, B: 1, C: 2, D: 3
+        answer_index = string.ascii_uppercase.index(answer[-1])
+
+        out_doc = {
+            "problem": doc["input_question"],
+            # The answer is the index of the correct response (0-indexed)
+            "gold": answer_index,
+        }
+        return out_doc
+
+    dataset = dataset.select_columns(
+        ["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash",
+         "input_choice_list"])
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
+
+def doc_to_target(doc: dict) -> str:
+    return doc["gold"]

+ 29 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -0,0 +1,29 @@
+task: meta_mmlu_pro_instruct
+dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals
+dataset_name: Llama-3.1-70B-Instruct-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 28 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -0,0 +1,28 @@
+task: meta_mmlu_pro_pretrain
+dataset_path: meta-llama/Llama-3.1-70B-evals
+dataset_name: Llama-3.1-70B-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'answer is \(([A-Z])\)'
+      - function: "take_first"
+generation_kwargs:
+  until: "\n\nQ: "
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 512
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0

+ 135 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py

@@ -0,0 +1,135 @@
+import string
+
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+    doc = [
+        {
+            "question": "Explain what difficulties would arise if messenger RNA molecules were not destroyed after they had produced some polypeptide chains.",
+            "options": {
+                "A": "mRNA would replicate rapidly",
+                "B": "The cell would use mRNA as a source of energy",
+                "C": "The cell would lack proteins",
+                "D": "Cell would enter a state of permanent division",
+                "E": "mRNA would be transformed into DNA",
+                "F": "Excess protein production, energy depletion, and potential harm to the cell",
+                "G": "mRNA would exit the cell and infect neighboring cells",
+                "H": "Proteins would be broken down into mRNA",
+                "I": "mRNA would become part of the cell membrane",
+                "J": "mRNA would bind to lipids and carbohydrates, disrupting cellular metabolism",
+            },
+            "answer": "F",
+        },
+        {
+            "question": "Based on the characteristic population curves that result from plotting population growth of a species, the most effective means of controlling the mosquito population is to",
+            "options": {
+                "A": "opt for zero population control once the K value of the curve has been reached",
+                "B": "maintain the population at the highest point of its logistic curve",
+                "C": "reduce the carrying capacity cif the environment to lower the K value",
+                "D": "decrease the mortality rate",
+                "E": "increase the birth rate of the species",
+                "F": "drastically reduce the population below the K value",
+                "G": "maintain the population at a point corresponding to the midpoint of its logistic curve",
+                "H": "increase the carrying capacity of the environment to raise the K value",
+                "I": "opt for zero population control at the beginning of the logistic curve",
+                "J": "null",
+            },
+            "answer": "C",
+        },
+        {
+            "question": "Solve the equation 1.2 = 0.4y using mental math.",
+            "options": {
+                "A": "3",
+                "B": "\u22123",
+                "C": "0.8",
+                "D": "2",
+                "E": "0.3",
+                "F": "5",
+                "G": "\u22124",
+                "H": "4",
+                "I": "6",
+                "J": "1.6",
+            },
+            "answer": "A",
+        },
+        {
+            "question": "assume you are Indonesian. In 2010, the rupiah exchange rate was around IDR15,000/USD, and the consumer price index in Indonesia and the United States was at 100. In 2019, the exchange rate changed to IDR14,000/USD. Simultaneously, Indonesia\u2019s inflation rose 5% due to the consumer price index rising to 105. Meanwhile, the United States\u2019 inflation rate rose 10% due to the consumer price index rising to 110. Whats the real exchange rate?",
+            "options": {
+                "A": "14000.00",
+                "B": "15500.00",
+                "C": "15000.00",
+                "D": "16000.00",
+                "E": "13500.00",
+                "F": "14666.67",
+                "G": "13888.89",
+                "H": "14800.00",
+                "I": "15200.00",
+                "J": "13333.33",
+            },
+            "answer": "F",
+        },
+        {
+            "question": "To move the economy closer to full employment the central bank decides that the federal funds rate must be increased. The appropriate open market operation is to ______ which ______ the money supply ______ aggregate demand and fights ______. OMO \u00a0\u00a0\u00a0 MONEY SUPPLY \u00a0\u00a0\u00a0 AD \u00a0\u00a0\u00a0 TO FIGHT",
+            "options": {
+                "A": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "B": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "C": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "D": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation",
+                "E": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Unemployment",
+                "F": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment",
+                "G": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+                "H": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation",
+                "I": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+                "J": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation",
+            },
+            "answer": "H",
+        },
+    ]
+    formatted_text = ""
+    for question in doc:
+        # Format user message with question and options
+        user_msg = "<|start_header_id|>user<|end_header_id|>\n\n"
+        user_msg += "Given the following question and candidate answers, choose the best answer.\n"
+        user_msg += f"Question: {question['question']}\n"
+
+        # Add options
+        for option_key, option_value in question["options"].items():
+            if option_value is not None:
+                user_msg += f"{option_key}. {option_value}\n"
+
+        user_msg += '\nYour response should end with "The best answer is [the_answer_letter]." where the [the_answer_letter] is a letter from the provided choices.\n\n'
+        user_msg += "Let's think step by step.<|eot_id|>"
+
+        # Add assistant placeholder message
+        assistant_msg = "<|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>"
+
+        formatted_text += user_msg + assistant_msg
+
+    return formatted_text
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        out_doc = {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+        return out_doc
+
+    dataset = dataset.select(range(1200, len(dataset)))
+
+    dataset = dataset.select_columns(
+        [
+            "input_question",
+            "input_correct_responses",
+            "input_final_prompts",
+            "is_correct",
+            "input_question_hash",
+            "input_choice_list",
+            "output_prediction_text",
+        ]
+    )
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    dataset = dataset.map(_process_doc)
+    return dataset.map(_process_doc)
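
Note that `doc_to_text` here ignores the incoming doc and always returns the same hand-written 5-shot CoT prefix; a quick standalone check, assuming it is run from this mmlu_pro work_dir so the module imports as `utils`:

```python
# Hedged check: the hard-coded 5-shot prefix is independent of the doc.
from utils import doc_to_text

prefix = doc_to_text({})  # the doc argument is immediately overwritten
assert prefix == doc_to_text({"anything": "else"})
# one user turn per few-shot example
assert prefix.count("<|start_header_id|>user<|end_header_id|>") == 5
```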

+ 3 - 8
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -5,7 +5,7 @@ import dspy
 from datasets import load_dataset
 
 from .datatypes import TaskDatasets
-from .helpers import train_val_test_split
+from .helpers import fixed_split, train_val_test_split
 
 
 def datasets(
@@ -19,12 +19,7 @@ def datasets(
         "meta-llama/Llama-3.3-70B-Instruct-evals",
         "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
     )
-    return train_val_test_split(
-        dataset["latest"],
-        _task_doc_example,
-        train_size,
-        validation_size,
-    )
+    return fixed_split(dataset["latest"], _task_doc_example)
 
 
 class TaskDoc(t.TypedDict):
@@ -58,7 +53,7 @@ def _task_doc_example(doc: TaskDoc) -> dspy.Example:
         answer=doc["output_parsed_answer"],
     )
     example._input_keys = {"question", "options"}
-    example._output_keys = {"answer"} 
+    example._output_keys = {"answer"}
     return example
 
 

The file diff has been suppressed because it is too large
+ 507 - 15358
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb