
Add more detail to the README

Kai Wu · 9 months ago · commit 1f666708cc

File diff suppressed because it is too large
+ 105 - 1
tools/benchmarks/meta_eval_reproduce/README.md


+ 3 - 2
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -1,9 +1,9 @@
+task: meta_mmlu_pro_instruct
 dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
 dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+test_split: latest
 output_type: generate_until
-task: meta_mmlu_pro_instruct
 process_docs: !function utils.process_docs
-test_split: latest
 doc_to_text: !function utils.doc_to_text
 doc_to_target: gold
 filter_list:
@@ -15,6 +15,7 @@ filter_list:
       - function: "take_first"
 generation_kwargs:
   until: []
+  do_sample: false
   temperature: 0
   max_gen_toks: 1024
 num_fewshot: 0
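
This hunk moves `task` and `test_split` to the top of the file and pins `do_sample: false`, so decoding is explicitly greedy rather than relying on `temperature: 0` alone. A minimal sketch (not the harness itself; assumes `transformers` is installed, with `gpt2` standing in for the actual Meta-Llama-3.1-8B-Instruct checkpoint) of what these `generation_kwargs` mean once forwarded to HF `generate()`:

```python
# Minimal sketch: greedy decoding as implied by do_sample: false.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in model, not the eval target
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("Q: What is 2 + 2?\nA:", return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=False,    # greedy decoding; sampling parameters are ignored
    max_new_tokens=64,  # plays the role of max_gen_toks in the YAML
)
print(tok.decode(out[0], skip_special_tokens=True))
```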

+ 3 - 2
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -1,9 +1,9 @@
+task: meta_mmlu_pro_pretrain
 dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
 dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+test_split: latest
 output_type: generate_until
-task: meta_mmlu_pro_pretrain
 process_docs: !function utils.process_docs
-test_split: latest
 doc_to_text: !function utils.doc_to_text
 doc_to_target: gold
 filter_list:
@@ -14,6 +14,7 @@ filter_list:
       - function: "take_first"
 generation_kwargs:
   until: "\n\nQ: "
+  do_sample: false
   temperature: 0
   max_gen_toks: 512
 num_fewshot: 0
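
The pretrain variant gets the same `do_sample: false` pin, and additionally stops generation at the few-shot delimiter `"\n\nQ: "`. A simplified sketch (not the harness's actual code, which also handles stopping during decoding) of the trimming that an `until` marker implies:

```python
# Simplified sketch: cut the completion at the first occurrence of the
# stop string, as generation_kwargs' `until` suggests.
def apply_until(completion: str, stop: str = "\n\nQ: ") -> str:
    idx = completion.find(stop)
    return completion if idx == -1 else completion[:idx]

# The raw continuation may run into the next few-shot example:
raw = "The answer is (B).\n\nQ: Which of the following..."
print(apply_until(raw))  # -> "The answer is (B)."
```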

+ 0 - 1
tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/utils.py

@@ -13,7 +13,6 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
         out_doc = {
             "problem": doc["input_question"],
             "gold": doc["input_correct_responses"][0],
-            "choices": list(doc["input_choice_list"])
         }
         return out_doc
     dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct", "input_question_hash", "input_choice_list", "output_prediction_text"])
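
After this change each processed doc carries only the question and the first gold answer; `"choices"` is no longer copied over, although `input_choice_list` remains among the selected columns. A hedged reconstruction (hypothetical, not the exact repo file) of the resulting `process_docs`:

```python
import datasets

def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        # Keep the question text and the first correct response;
        # the "choices" field is no longer materialized.
        return {
            "problem": doc["input_question"],
            "gold": doc["input_correct_responses"][0],
        }
    return dataset.map(_process_doc)
```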