
eval working

Kai Wu 9 months ago
parent
commit 37be8e9923

+ 5 - 5
tools/benchmarks/meta_eval_reproduce/eval_config.yaml

@@ -3,12 +3,12 @@ model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to e
 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
 
-tasks: "meta_math_hard" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
 
-tensor_parallel_size: 2 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for each model copy.
+tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for each model copy.
 
-data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
+data_parallel_size: 8 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
 
 gpu_memory_utilization: 0.9 # The vLLM argument that specifies GPU memory utilization; the rest will be reserved for the KV cache.
 
@@ -18,8 +18,8 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 
 output_path: "eval_results" # the output folder to store all the eval results and samples.
 
-limit: 16 # Limit number of examples per task, set 'null' to run all.
-#limit: null # Limit number of examples per task.
+#limit: 16 # Limit number of examples per task, set 'null' to run all.
+limit: null # Limit number of examples per task.
 
 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
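
The new parallelism settings keep the total GPU budget at 8: one GPU per model copy (tensor_parallel_size: 1) times eight copies (data_parallel_size: 8), replacing the previous 2 x 4 split. Below is a minimal sketch of how these values could be read and sanity-checked before launching a run, assuming the config is parsed with PyYAML; the load_eval_config helper and the GPU-count check are illustrative, not part of the repo.

import yaml

def load_eval_config(path: str = "eval_config.yaml") -> dict:
    # Parse the eval config with PyYAML.
    with open(path, "r") as f:
        return yaml.safe_load(f)

config = load_eval_config()
# Total GPUs required = GPUs per model copy * number of model copies.
gpus_needed = config["tensor_parallel_size"] * config["data_parallel_size"]
print(f"tensor_parallel={config['tensor_parallel_size']}, "
      f"data_parallel={config['data_parallel_size']} -> {gpus_needed} GPUs needed")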
 

+ 2 - 2
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml

@@ -1,7 +1,7 @@
 task: meta_ifeval
-dataset_path: json
+dataset_path: parquet
 dataset_kwargs:
-  data_files: ../joined_ifeval.json
+  data_files: ./work_dir/joined_ifeval.parquet
 output_type: generate_until
 test_split: train
 num_fewshot: 0
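
The IFeval task now reads a pre-joined parquet file from the working directory (./work_dir/joined_ifeval.parquet) instead of a JSON file one level up. A quick way to confirm the file loads as the task expects, sketched with the Hugging Face datasets parquet loader; the path comes from the YAML above, the rest is an assumption.

from datasets import load_dataset

# ifeval.yaml sets test_split: train, so the joined parquet is exposed as a single "train" split.
ifeval_ds = load_dataset("parquet", data_files="./work_dir/joined_ifeval.parquet")["train"]
print(ifeval_ds)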

+ 0 - 46
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard.py

@@ -1,46 +0,0 @@
-from datasets import load_dataset,Dataset
-import os
-import yaml
-# def check_sample(example):
-#     if "kwargs" in example and not example["kwargs"]:
-#         print(example)
-#         raise ValueError("This example did not got ds for IFeval")
-#     if "solution" in example and not example["solution"]:
-#         print(example)
-#         raise ValueError("This example did not got ds for MATH_hard")
-def load_config(config_path: str = "./eval_config.yaml"):
-    # Read the YAML configuration file
-    with open(config_path, "r") as file:
-        config = yaml.safe_load(file)
-    return config
-# current_dir = os.getcwd()
-# print("current_dir",current_dir)
-# yaml = load_config(str(current_dir)+"/eval_config.yaml")
-# meta_dataset_name = yaml["evals_dataset"]
-# model_name = meta_dataset_name.split("/")[-1].replace("-evals","")
-# original_dataset_name = "lighteval/MATH-Hard"
-
-# meta_data = load_dataset(
-#     meta_dataset_name,
-#     name=f"{model_name}-evals__math_hard__details",
-#     split="latest"
-#     )
-# math_data = load_dataset(
-#     original_dataset_name,
-#     split="test"
-#     )
-# meta_df = meta_data.to_pandas()
-# math_df = math_data.to_pandas()
-# math_df = math_df.rename(columns={"problem": "input_question"})
-
-# joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
-# ds = Dataset.from_pandas(joined)
-# ds = ds.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-# ds = ds.rename_column("is_correct","previous_is_correct")
-# ds = ds.rename_column("output_prediction_text","previous_output_prediction_text")
-from datasets import load_dataset
-current_dir = os.getcwd()
-print("current_dir",current_dir)
-yaml = load_config(str(current_dir)+"/eval_config.yaml")
-work_dir = yaml["work_dir"]
-load_dataset('parquet', data_files=str(current_dir)+"/"+work_dir+"/joined_math.parquet")
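
The only live statements in this deleted script re-loaded ./work_dir/joined_math.parquet; the commented-out block records how that file was produced, by joining the Meta 3.1 evals details with lighteval/MATH-Hard on input_question. A condensed sketch of that join, reconstructed from the commented-out code above; the dataset and column names come from that code, while the hard-coded 8B config name and the to_parquet output path are assumptions.

from datasets import load_dataset, Dataset

# Pull the per-example details from the Meta evals dataset and the reference solutions
# from the original MATH-Hard test set.
meta_df = load_dataset(
    "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
    name="Meta-Llama-3.1-8B-Instruct-evals__math_hard__details",
    split="latest",
).to_pandas()
math_df = load_dataset("lighteval/MATH-Hard", split="test").to_pandas()
math_df = math_df.rename(columns={"problem": "input_question"})

# Join on the question text, keep only the columns the eval needs, and write the parquet
# that math_hard_0shot_cot.yaml points at.
joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
ds = Dataset.from_pandas(joined).select_columns(
    ["input_question", "input_correct_responses", "input_final_prompts",
     "is_correct", "solution", "output_prediction_text"]
)
ds = ds.rename_column("is_correct", "previous_is_correct")
ds = ds.rename_column("output_prediction_text", "previous_output_prediction_text")
ds.to_parquet("./work_dir/joined_math.parquet")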

+ 1 - 6
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml

@@ -1,9 +1,6 @@
 dataset_path: parquet
-dataset_name: null
 dataset_kwargs:
-  data_files:
-    train: /home/kaiwu/work/llama-recipes/tools/benchmarks/meta_eval_reproduce/work_dir/joined_math.parquet
-  trust_remote_code: true
+  data_files: ./work_dir/joined_math.parquet
 task: meta_math_hard
 process_docs: !function utils.process_docs
 output_type: generate_until
@@ -22,5 +19,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
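
With the hard-coded /home/kaiwu/... path replaced by the relative ./work_dir/joined_math.parquet and the duplicated dataset_kwargs / trust_remote_code block dropped, the task YAML now carries the dataset location itself, which is what makes the deleted math_hard.py loader unnecessary. A minimal check mirroring what that script did, assuming eval_config.yaml still defines the work_dir key the removed code read:

import os
import yaml
from datasets import load_dataset

# Read work_dir from eval_config.yaml, as the removed math_hard.py did.
with open("eval_config.yaml", "r") as f:
    work_dir = yaml.safe_load(f)["work_dir"]

# Load the joined MATH-Hard parquet that math_hard_0shot_cot.yaml now points at.
math_ds = load_dataset("parquet", data_files=os.path.join(work_dir, "joined_math.parquet"))
print(math_ds)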