1 year ago · ff10442db1
--- a/tools/benchmarks/meta_eval_reproduce/README.md
+++ b/tools/benchmarks/meta_eval_reproduce/README.md
--- a/tools/benchmarks/meta_eval_reproduce/eval_config.yaml
+++ b/tools/benchmarks/meta_eval_reproduce/eval_config.yaml
@@ -1,4 +1,4 @@
 
				-model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
			
 
				+model_name: "meta-llama/Meta-Llama-3.1-Instruct-8B" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
			
 
				 
			
 
				 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
			
 
				 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
			
@@ -8,7 +8,7 @@ tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "
 
				 
			
 
				 tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for a model copy.
			
 
				 
			
 
				-data_parallel_size: 8 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
			
 
				+data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
			
 
				 
			
 
				 gpu_memory_utilization: 0.9 # The vLLM argument that specifies gpu memory utilization, the rest will be reserved for KV cache.
			
 
				 
			
@@ -18,7 +18,7 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 
				 
			
 
				 output_path: "eval_results" # the output folder to store all the eval results and samples.
			
 
				 
			
 
				-#limit: 16 # Limit number of examples per task, set 'null' to run all.
			
 
				+#limit: 12 # Limit number of examples per task, set 'null' to run all.
			
 
				 limit: null # Limit number of examples per task.
			
 
				 
			
 
				 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
			
--- a/tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py
+++ b/tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py
@@ -32,15 +32,16 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 
				             "meta_target": doc["input_correct_responses"]
			
 
				         }
			
 
				         return out_doc
			
 
				-    #dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","output_prediction_text"])
			
 
				-    dataset = dataset.rename_column("is_correct","previously_is_correct")
			
 
				     return dataset.map(_process_doc)
			
 
				 
			
 
				 
			
 
				 def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
			
 
				     candidates = results[0]
			
 
				-
			
 
				-    unnormalized_answer = remove_boxed(last_boxed_only_string(candidates))
			
 
				+    last_boxed_string = last_boxed_only_string(candidates)
			
 
				+    if not last_boxed_string:
			
 
				+        # No boxed string found, so we can't evaluate
			
 
				+        return {"exact_match": 0}
			
 
				+    unnormalized_answer = remove_boxed(last_boxed_string)
			
 
				     answer = normalize_final_answer(unnormalized_answer)
			
 
				 
			
 
				     if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
			
--- a/tools/benchmarks/meta_eval_reproduce/prepare_dataset.py
+++ b/tools/benchmarks/meta_eval_reproduce/prepare_dataset.py
@@ -4,7 +4,6 @@ def get_ifeval_data(model_name,output_dir):
 
				     if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
			
 
				         raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
			
 
				     original_dataset_name = "wis-k/instruction-following-eval"
			
 
				-    #meta_dataset_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-evals"
			
 
				     meta_dataset_name = f"meta-llama/{model_name}-evals"
			
 
				     meta_data = load_dataset(
			
 
				         meta_dataset_name,
			
@@ -19,11 +18,6 @@ def get_ifeval_data(model_name,output_dir):
 
				     meta_df = meta_data.to_pandas()
			
 
				     ifeval_df = ifeval_data.to_pandas()
			
 
				     ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
			
 
				-    print("meta_df",meta_df.columns)
			
 
				-    print(meta_df)
			
 
				-    print("ifeval_df",ifeval_df.columns)
			
 
				-
			
 
				-    print(ifeval_df)
			
 
				 
			
 
				     joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
			
 
				     joined = joined.rename(columns={"input_final_prompts": "prompt"})
			
@@ -31,7 +25,6 @@ def get_ifeval_data(model_name,output_dir):
 
				     joined = Dataset.from_pandas(joined)
			
 
				     joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
			
 
				     joined.rename_column("output_prediction_text","previous_output_prediction_text")
			
 
				-    print(joined)
			
 
				     for item in joined:
			
 
				         check_sample(item)
			
 
				     joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
			
@@ -52,22 +45,15 @@ def get_math_data(model_name,output_dir):
 
				     meta_df = meta_data.to_pandas()
			
 
				     math_df = math_data.to_pandas()
			
 
				     math_df = math_df.rename(columns={"problem": "input_question"})
			
 
				-    print("meta_df",meta_df.columns)
			
 
				-    print(meta_df)
			
 
				-    print("math_df",math_df.columns)
			
 
				-
			
 
				-    print(math_df)
			
 
				 
			
 
				     joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
			
 
				-    # joined = Dataset.from_pandas(joined)
			
 
				-    # joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
			
 
				-    # joined = joined.rename_column("is_correct","previous_is_correct")
			
 
				-    # joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
			
 
				-    print(joined)
			
 
				-    # for item in joined:
			
 
				-    #     check_sample(item)
			
 
				+    joined = Dataset.from_pandas(joined)
			
 
				+    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
			
 
				+    joined = joined.rename_column("is_correct","previous_is_correct")
			
 
				+    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
			
 
				+    for item in joined:
			
 
				+        check_sample(item)
			
 
				     joined.to_parquet(output_dir + f"/joined_math.parquet")
			
 
				-    #joined.save_to_disk(output_dir + f"/joined_math")
			
 
				 def get_question(example):
			
 
				     try:
			
 
				         example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")