Browse source

add result table to README

Kai Wu 9 months ago
parent
commit
ff10442db1

+ 34 - 5
tools/benchmarks/meta_eval_reproduce/README.md
The file diff has been suppressed because it is too large


+ 3 - 3
tools/benchmarks/meta_eval_reproduce/eval_config.yaml

@@ -1,4 +1,4 @@
-model_name: "meta-llama/Meta-Llama-3.1-Instruct-8B" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
 
 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the loaded model. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
@@ -8,7 +8,7 @@ tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "
 
 tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for one model copy.
 
-data_parallel_size: 8 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
+data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
 
 gpu_memory_utilization: 0.9 # The vLLM argument that specifies the fraction of GPU memory to use; vLLM allocates the KV cache within this budget.
 
@@ -18,7 +18,7 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 
 output_path: "eval_results" # the output folder to store all the eval results and samples.
 
-#limit: 16 # Limit number of examples per task, set 'null' to run all.
+#limit: 12 # Limit number of examples per task, set 'null' to run all.
 limit: null # Limit number of examples per task.
 
 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
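For context on how these settings interact: a model copy spans tensor_parallel_size GPUs and data_parallel_size copies run in parallel, so the two multiply to the total GPU count. Below is a minimal sketch, assuming the config is read with PyYAML; the helper name is illustrative, not the repo's actual loader.

```python
# Minimal sketch (not the repo's loader): read eval_config.yaml with PyYAML
# and sanity-check the parallelism settings before launching vLLM.
import yaml

def load_eval_config(path="eval_config.yaml"):  # hypothetical helper name
    with open(path) as f:
        cfg = yaml.safe_load(f)
    # Total GPUs needed = one tensor-parallel group per data-parallel replica.
    gpus_needed = cfg["tensor_parallel_size"] * cfg["data_parallel_size"]
    print(f"model: {cfg['model_name']}, GPUs needed: {gpus_needed}")
    return cfg

cfg = load_eval_config()
```

With the values above, 1 × 4 = 4 GPUs are required for the evaluation run.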

+ 5 - 4
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py

@@ -32,15 +32,16 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
             "meta_target": doc["input_correct_responses"]
         }
         return out_doc
-    #dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","output_prediction_text"])
-    dataset = dataset.rename_column("is_correct","previously_is_correct")
     return dataset.map(_process_doc)
 
 
 def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
     candidates = results[0]
-
-    unnormalized_answer = remove_boxed(last_boxed_only_string(candidates))
+    last_boxed_string = last_boxed_only_string(candidates)
+    if not last_boxed_string:
+        # No boxed string found, so we can't evaluate
+        return {"exact_match": 0}
+    unnormalized_answer = remove_boxed(last_boxed_string)
     answer = normalize_final_answer(unnormalized_answer)
 
     if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
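The added guard handles generations that never produce a \boxed{...} answer, scoring them as exact_match 0 instead of letting remove_boxed fail on None. A simplified sketch of the extraction step follows; the real last_boxed_only_string / remove_boxed come from the harness's MATH utilities and handle more edge cases.

```python
# Simplified stand-ins for last_boxed_only_string / remove_boxed, to show why
# the guard is needed; the harness's real MATH utilities cover more cases.
def last_boxed_only_string(text):
    # Return the last "\boxed{...}" span, or None if no boxed answer exists.
    start = text.rfind("\\boxed{")
    if start == -1:
        return None
    depth = 0
    for i in range(start + len("\\boxed"), len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None  # unbalanced braces

def remove_boxed(s):
    # Strip the "\boxed{" prefix and trailing "}" to keep only the answer.
    return s[len("\\boxed{"):-1]

print(last_boxed_only_string("So the result is \\boxed{\\frac{1}{2}}."))  # \boxed{\frac{1}{2}}
print(last_boxed_only_string("I am not sure about the answer."))          # None -> exact_match 0
```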

+ 6 - 20
tools/benchmarks/meta_eval_reproduce/prepare_dataset.py

@@ -4,7 +4,6 @@ def get_ifeval_data(model_name,output_dir):
     if model_name not in ["Meta-Llama-3.1-8B-Instruct","Meta-Llama-3.1-70B-Instruct","Meta-Llama-3.1-405B-Instruct"]:
         raise ValueError("Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval")
     original_dataset_name = "wis-k/instruction-following-eval"
-    #meta_dataset_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-evals"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
     meta_data = load_dataset(
         meta_dataset_name,
@@ -19,11 +18,6 @@ def get_ifeval_data(model_name,output_dir):
     meta_df = meta_data.to_pandas()
     ifeval_df = ifeval_data.to_pandas()
     ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
-    print("meta_df",meta_df.columns)
-    print(meta_df)
-    print("ifeval_df",ifeval_df.columns)
-
-    print(ifeval_df)
 
     joined = meta_df.join(ifeval_df.set_index('input_question'),on="input_question")
     joined = joined.rename(columns={"input_final_prompts": "prompt"})
@@ -31,7 +25,6 @@ def get_ifeval_data(model_name,output_dir):
     joined = Dataset.from_pandas(joined)
     joined = joined.select_columns(["input_question", "prompt", "previous_is_correct","instruction_id_list","kwargs","output_prediction_text","key"])
     joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    print(joined)
     for item in joined:
         check_sample(item)
     joined.to_parquet(output_dir + f"/joined_ifeval.parquet")
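One detail worth keeping in mind in this pipeline: datasets.Dataset.rename_column returns a new Dataset rather than renaming in place, which is why its result is assigned above. A toy illustration (the data values are made up):

```python
# Dataset.rename_column is not in-place; the returned Dataset must be captured.
from datasets import Dataset

ds = Dataset.from_dict({"output_prediction_text": ["a", "b"], "key": [1, 2]})

ds.rename_column("output_prediction_text", "previous_output_prediction_text")
print(ds.column_names)  # unchanged: ['output_prediction_text', 'key']

ds = ds.rename_column("output_prediction_text", "previous_output_prediction_text")
print(ds.column_names)  # ['previous_output_prediction_text', 'key']
```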
@@ -52,22 +45,15 @@ def get_math_data(model_name,output_dir):
     meta_df = meta_data.to_pandas()
     math_df = math_data.to_pandas()
     math_df = math_df.rename(columns={"problem": "input_question"})
-    print("meta_df",meta_df.columns)
-    print(meta_df)
-    print("math_df",math_df.columns)
-
-    print(math_df)
 
     joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
-    # joined = Dataset.from_pandas(joined)
-    # joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-    # joined = joined.rename_column("is_correct","previous_is_correct")
-    # joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
-    print(joined)
-    # for item in joined:
-    #     check_sample(item)
+    joined = Dataset.from_pandas(joined)
+    joined = joined.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
+    joined = joined.rename_column("is_correct","previous_is_correct")
+    joined = joined.rename_column("output_prediction_text","previous_output_prediction_text")
+    for item in joined:
+        check_sample(item)
     joined.to_parquet(output_dir + f"/joined_math.parquet")
-    #joined.save_to_disk(output_dir + f"/joined_math")
 def get_question(example):
     try:
         example["input_question"] = eval(example["input_question"].replace("null","None").replace("true","True").replace("false","False"))["dialog"][0]["body"].replace("Is it True that the first song","Is it true that the first song").replace("Is the following True","Is the following true")