@@ -3,12 +3,12 @@ model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to e

evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
-tasks: "meta_math_hard" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
|
|
|
|
|
|
+tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
|
|
# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
|
|
# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.

-tensor_parallel_size: 2 # The vLLM argument that specifies the tensor parallel size for the model, i.e. how many GPUs to use for one model copy.
+tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, i.e. how many GPUs to use for one model copy.

-data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, i.e. how many copies of the model will be used.
+data_parallel_size: 8 # The vLLM argument that specifies the data parallel size for the model, i.e. how many copies of the model will be used.

gpu_memory_utilization: 0.9 # The vLLM argument that specifies the fraction of GPU memory vLLM may use; within that budget, memory left after loading the model is reserved for the KV cache.
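
The two parallelism settings multiply: tensor_parallel_size × data_parallel_size should match the number of available GPUs, and both the old values (2 × 4) and the new ones (1 × 8) target an 8-GPU node. An 8B model fits comfortably on a single GPU, so eight independent copies (TP=1, DP=8) avoid cross-GPU communication entirely. A minimal sketch of how the per-copy settings reach vLLM, assuming its offline LLM API (the harness may wire this up differently):

```python
# Sketch (assumes vLLM's offline LLM API): one model copy spread across
# `tensor_parallel_size` GPUs.
from vllm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tensor_parallel_size=1,      # GPUs used by this single model copy
    gpu_memory_utilization=0.9,  # fraction of each GPU's memory vLLM may claim
)
# data_parallel_size is handled one level above vLLM: the eval harness launches
# that many independent replicas like this one and shards the examples across them.
```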
@@ -18,8 +18,8 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s

output_path: "eval_results" # The output folder to store all the eval results and samples.

-limit: 16 # Limit number of examples per task; set 'null' to run all.
-#limit: null # Limit number of examples per task.
+#limit: 16 # Limit number of examples per task; set 'null' to run all.
+limit: null # Limit number of examples per task.
verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
|
|
verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
|
|
|
|
|
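
End-to-end, these fields map almost one-to-one onto an lm-evaluation-harness run with the vLLM backend. A hedged sketch, assuming a recent lm-eval release and that the meta_* task definitions have already been generated into a local include directory; the "./work_dir" path and the config filename are illustrative, not taken from this diff:

```python
# Sketch: driving lm-evaluation-harness from eval_config.yaml.
# "./work_dir" (where the meta_* task YAMLs would live) is an assumed path.
import lm_eval
import yaml
from lm_eval.tasks import TaskManager

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Register the custom meta_* tasks before requesting them by name.
task_manager = TaskManager(include_path="./work_dir")

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        f"pretrained={cfg['model_name']},"
        f"tensor_parallel_size={cfg['tensor_parallel_size']},"
        f"data_parallel_size={cfg['data_parallel_size']},"
        f"gpu_memory_utilization={cfg['gpu_memory_utilization']}"
    ),
    tasks=[cfg["tasks"]],          # "meta_instruct" expands to all four instruct tasks
    batch_size=cfg["batch_size"],  # "auto" probes for the largest batch that fits
    limit=cfg["limit"],            # YAML null -> Python None: run every example
    task_manager=task_manager,
)
```

With limit: null the scores are computed over each full task, which is what you want for reportable numbers; re-enabling the commented limit: 16 line gives a quick smoke test instead.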