hace 1 año · ed201fc8e5
--- a/tools/benchmarks/meta_eval_reproduce/README.md
+++ b/tools/benchmarks/meta_eval_reproduce/README.md
--- a/tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/utils.py
+++ b/tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/utils.py
@@ -12,7 +12,6 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 
				         out_doc = {
			
 
				             "problem": doc["input_question"],
			
 
				             "gold": doc["input_correct_responses"][0],
			
 
				-            "choices": list(doc["input_choice_list"])
			
 
				         }
			
 
				         return out_doc
			
 
				     dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"])
			
--- a/tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py
+++ b/tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py
@@ -37,46 +37,10 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 
				     return dataset.map(_process_doc)
			
 
				 
			
 
				 
			
 
				-
			
 
				-def extract_result_from_boxed(answer: str) -> str:
			
 
				-    box_start = "\\boxed"
			
 
				-    # format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
			
 
				-    start = answer.rfind(box_start)
			
 
				-    if start < 0:
			
 
				-        return ""
			
 
				-    answer = answer[start + len(box_start) :].strip()
			
 
				-    ends_with_curly = answer.startswith("{")
			
 
				-    i = 0
			
 
				-    open_braces = 0
			
 
				-    while i < len(answer):
			
 
				-        if answer[i] == "{":
			
 
				-            open_braces += 1
			
 
				-        elif answer[i] == "}":
			
 
				-            open_braces -= 1
			
 
				-        if open_braces == 0:
			
 
				-            if ends_with_curly:
			
 
				-                answer = answer[: i + 1].strip()
			
 
				-                break
			
 
				-            elif answer[i] == "$":
			
 
				-                answer = answer[:i].strip()
			
 
				-                break
			
 
				-        i += 1
			
 
				-    else:
			
 
				-        return ""
			
 
				-    # remove extra curly braces
			
 
				-    while True:
			
 
				-        if answer.startswith("{") and answer.endswith("}"):
			
 
				-            answer = answer[1:-1].strip()
			
 
				-        else:
			
 
				-            break
			
 
				-    return answer
			
 
				-
			
 
				 def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
			
 
				     candidates = results[0]
			
 
				 
			
 
				-    unnormalized_answer = get_unnormalized_answer(candidates)
			
 
				-    if unnormalized_answer == "[invalidanswer]":
			
 
				-        unnormalized_answer = extract_result_from_boxed(candidates)
			
 
				+    unnormalized_answer = remove_boxed(last_boxed_only_string(candidates))
			
 
				     answer = normalize_final_answer(unnormalized_answer)
			
 
				 
			
 
				     if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):