
update readme

Kai Wu, 7 months ago
Parent
Current commit
576e574e31
17 files changed, 26 insertions(+), 26 deletions(-)
  1. tools/benchmarks/llm_eval_harness/README.md (+2 −2)
  2. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md (+24 −24)
  3. tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml (+0 −0)
  4. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/bbh_3shot_cot.yaml (+0 −0)
  5. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/utils.py (+0 −0)
  6. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/gpqa_0shot_cot.yaml (+0 −0)
  7. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/utils.py (+0 −0)
  8. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/ifeval.yaml (+0 −0)
  9. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/utils.py (+0 −0)
  10. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_hard_0shot_cot.yaml (+0 −0)
  11. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/utils.py (+0 −0)
  12. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_instruct.yaml (+0 −0)
  13. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_pretrain.yaml (+0 −0)
  14. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml (+0 −0)
  15. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml (+0 −0)
  16. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py (+0 −0)
  17. tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py (+0 −0)

The file diff is too large to display
+ 2 - 2
tools/benchmarks/llm_eval_harness/README.md


The file diff is too large to display
+ 24 - 24
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml → tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/bbh_3shot_cot.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/gpqa_0shot_cot.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/ifeval.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_hard_0shot_cot.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_instruct.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_pretrain.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py → tools/benchmarks/llm_eval_harness/meta_eval/prepare_meta_eval.py
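The rename entries above all share one pattern: the `meta_eval_reproduce` directory was moved to `meta_eval`, and git recorded the move as a per-file rename for everything inside it. A hedged sketch of how such a commit could be produced (in a throwaway repo with a placeholder file, not the actual repository history):

```shell
set -e
cd "$(mktemp -d)"
git init -q .
git config user.email "demo@example.com"   # placeholder identity for the demo repo
git config user.name  "Demo"

# Recreate one file from the old layout as a stand-in.
mkdir -p tools/benchmarks/llm_eval_harness/meta_eval_reproduce
touch tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml
git add -A
git commit -qm "initial layout"

# Move the whole directory; git tracks each contained file as a rename.
git mv tools/benchmarks/llm_eval_harness/meta_eval_reproduce \
       tools/benchmarks/llm_eval_harness/meta_eval
git commit -qm "rename meta_eval_reproduce to meta_eval"

# Show the rename as git records it (R100 = 100% content similarity,
# i.e. a pure move with no edits, matching the +0 −0 entries above).
git show --name-status --format= HEAD
```

Because the moved files are byte-identical, git's rename detection reports them at 100% similarity, which is why the web UI lists them with zero insertions and deletions.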