
eval working

Kai Wu 9 months ago
parent
commit 37be8e9923

+ 5 - 5
tools/benchmarks/meta_eval_reproduce/eval_config.yaml

@@ -3,12 +3,12 @@ model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to e
 evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
 # Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
 
-tasks: "meta_math_hard" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
 
-tensor_parallel_size: 2 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for each model copy.
+tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for each model copy.
 
-data_parallel_size: 4 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
+data_parallel_size: 8 # The vLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
 
 gpu_memory_utilization: 0.9 # The vLLM argument that specifies GPU memory utilization; the rest will be reserved for the KV cache.
 
@@ -18,8 +18,8 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
 
 output_path: "eval_results" # the output folder to store all the eval results and samples.
 
-limit: 16 # Limit number of examples per task, set 'null' to run all.
-#limit: null # Limit number of examples per task.
+#limit: 16 # Limit number of examples per task, set 'null' to run all.
+limit: null # Limit number of examples per task.
 
 verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
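
The new parallelism settings keep the total GPU budget at 8: one GPU per model copy (tensor_parallel_size: 1) times eight copies (data_parallel_size: 8), replacing the previous 2 x 4 split. Below is a minimal sketch of how these values could be read and sanity-checked before launching a run, assuming the config is parsed with PyYAML; the load_eval_config helper and the GPU-count check are illustrative, not part of the repo.

import yaml

def load_eval_config(path: str = "eval_config.yaml") -> dict:
    # Parse the eval config with PyYAML.
    with open(path, "r") as f:
        return yaml.safe_load(f)

config = load_eval_config()
# Total GPUs required = GPUs per model copy * number of model copies.
gpus_needed = config["tensor_parallel_size"] * config["data_parallel_size"]
print(f"tensor_parallel={config['tensor_parallel_size']}, "
      f"data_parallel={config['data_parallel_size']} -> {gpus_needed} GPUs needed")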
 

+ 2 - 2
tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml

@@ -1,7 +1,7 @@
 task: meta_ifeval
-dataset_path: json
+dataset_path: parquet
 dataset_kwargs:
-  data_files: ../joined_ifeval.json
+  data_files: ./work_dir/joined_ifeval.parquet
 output_type: generate_until
 test_split: train
 num_fewshot: 0
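
The IFeval task now reads a pre-joined parquet file from the working directory (./work_dir/joined_ifeval.parquet) instead of a JSON file one level up. A quick way to confirm the file loads as the task expects, sketched with the Hugging Face datasets parquet loader; the path comes from the YAML above, the rest is an assumption.

from datasets import load_dataset

# ifeval.yaml sets test_split: train, so the joined parquet is exposed as a single "train" split.
ifeval_ds = load_dataset("parquet", data_files="./work_dir/joined_ifeval.parquet")["train"]
print(ifeval_ds)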

+ 0 - 46
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard.py

@@ -1,46 +0,0 @@
-from datasets import load_dataset,Dataset
-import os
-import yaml
-# def check_sample(example):
-#     if "kwargs" in example and not example["kwargs"]:
-#         print(example)
-#         raise ValueError("This example did not got ds for IFeval")
-#     if "solution" in example and not example["solution"]:
-#         print(example)
-#         raise ValueError("This example did not got ds for MATH_hard")
-def load_config(config_path: str = "./eval_config.yaml"):
-    # Read the YAML configuration file
-    with open(config_path, "r") as file:
-        config = yaml.safe_load(file)
-    return config
-# current_dir = os.getcwd()
-# print("current_dir",current_dir)
-# yaml = load_config(str(current_dir)+"/eval_config.yaml")
-# meta_dataset_name = yaml["evals_dataset"]
-# model_name = meta_dataset_name.split("/")[-1].replace("-evals","")
-# original_dataset_name = "lighteval/MATH-Hard"
-
-# meta_data = load_dataset(
-#     meta_dataset_name,
-#     name=f"{model_name}-evals__math_hard__details",
-#     split="latest"
-#     )
-# math_data = load_dataset(
-#     original_dataset_name,
-#     split="test"
-#     )
-# meta_df = meta_data.to_pandas()
-# math_df = math_data.to_pandas()
-# math_df = math_df.rename(columns={"problem": "input_question"})
-
-# joined = meta_df.join(math_df.set_index('input_question'),on="input_question")
-# ds = Dataset.from_pandas(joined)
-# ds = ds.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","solution","output_prediction_text"])
-# ds = ds.rename_column("is_correct","previous_is_correct")
-# ds = ds.rename_column("output_prediction_text","previous_output_prediction_text")
-from datasets import load_dataset
-current_dir = os.getcwd()
-print("current_dir",current_dir)
-yaml = load_config(str(current_dir)+"/eval_config.yaml")
-work_dir = yaml["work_dir"]
-load_dataset('parquet', data_files=str(current_dir)+"/"+work_dir+"/joined_math.parquet")
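
The only live statements in this deleted script re-loaded ./work_dir/joined_math.parquet; the commented-out block records how that file was produced, by joining the Meta 3.1 evals details with lighteval/MATH-Hard on input_question. A condensed sketch of that join, reconstructed from the commented-out code above; the dataset and column names come from that code, while the hard-coded 8B config name and the to_parquet output path are assumptions.

from datasets import load_dataset, Dataset

# Pull the per-example details from the Meta evals dataset and the reference solutions
# from the original MATH-Hard test set.
meta_df = load_dataset(
    "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
    name="Meta-Llama-3.1-8B-Instruct-evals__math_hard__details",
    split="latest",
).to_pandas()
math_df = load_dataset("lighteval/MATH-Hard", split="test").to_pandas()
math_df = math_df.rename(columns={"problem": "input_question"})

# Join on the question text, keep only the columns the eval needs, and write the parquet
# that math_hard_0shot_cot.yaml points at.
joined = meta_df.join(math_df.set_index("input_question"), on="input_question")
ds = Dataset.from_pandas(joined).select_columns(
    ["input_question", "input_correct_responses", "input_final_prompts",
     "is_correct", "solution", "output_prediction_text"]
)
ds = ds.rename_column("is_correct", "previous_is_correct")
ds = ds.rename_column("output_prediction_text", "previous_output_prediction_text")
ds.to_parquet("./work_dir/joined_math.parquet")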

+ 1 - 6
tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml

@@ -1,9 +1,6 @@
 dataset_path: parquet
-dataset_name: null
 dataset_kwargs:
-  data_files:
-    train: /home/kaiwu/work/llama-recipes/tools/benchmarks/meta_eval_reproduce/work_dir/joined_math.parquet
-  trust_remote_code: true
+  data_files: ./work_dir/joined_math.parquet
 task: meta_math_hard
 process_docs: !function utils.process_docs
 output_type: generate_until
@@ -22,5 +19,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
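
With the hard-coded /home/kaiwu/... path replaced by the relative ./work_dir/joined_math.parquet and the duplicated dataset_kwargs / trust_remote_code block dropped, the task YAML now carries the dataset location itself, which is what makes the deleted math_hard.py loader unnecessary. A minimal check mirroring what that script did, assuming eval_config.yaml still defines the work_dir key the removed code read:

import os
import yaml
from datasets import load_dataset

# Read work_dir from eval_config.yaml, as the removed math_hard.py did.
with open("eval_config.yaml", "r") as f:
    work_dir = yaml.safe_load(f)["work_dir"]

# Load the joined MATH-Hard parquet that math_hard_0shot_cot.yaml now points at.
math_ds = load_dataset("parquet", data_files=os.path.join(work_dir, "joined_math.parquet"))
print(math_ds)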