
refactor folder: move meta_eval_reproduce under llm_eval_harness

Kai Wu · 8 months ago
commit 307510b8c5
20 changed files with 19 additions and 14 deletions
  1. .github/scripts/spellcheck_conf/wordlist.txt (+3 -0)
  2. requirements.txt (+0 -4)
  3. tools/benchmarks/llm_eval_harness/README.md (+4 -0)
  4. tools/benchmarks/meta_eval_reproduce/README.md (+12 -10)
  5. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml (moved, +0 -0)
  6. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_eval.py (moved, +0 -0)
  7. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml (moved, +0 -0)
  8. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py (moved, +0 -0)
  9. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml (moved, +0 -0)
  10. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py (moved, +0 -0)
  11. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml (moved, +0 -0)
  12. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py (moved, +0 -0)
  13. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml (moved, +0 -0)
  14. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py (moved, +0 -0)
  15. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml (moved, +0 -0)
  16. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml (moved, +0 -0)
  17. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml (moved, +0 -0)
  18. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml (moved, +0 -0)
  19. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py (moved, +0 -0)
  20. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_dataset.py (moved, +0 -0)

+ 3 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1443,3 +1443,6 @@ ifeval
 lighteval
 sqrt
 wis
+evals
+mmlu
+parsers

+ 0 - 4
requirements.txt

@@ -29,7 +29,3 @@ langchain
 langchain_community
 sentence_transformers
 codeshield
-lm-eval==0.4.3
-immutabledict
-antlr4-python3-runtime==4.11
-nltk=3.8.1
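
The four lines removed above are the eval-harness-specific dependencies. If you run the meta evals on their own, a standalone requirements fragment along these lines should cover them (a sketch based only on the removed lines; note the original `nltk=3.8.1` used a single `=`, which pip rejects — a valid version pin uses `==`):

```text
# Eval-harness dependencies removed from the main requirements.txt
lm-eval==0.4.3
immutabledict
antlr4-python3-runtime==4.11
nltk==3.8.1
```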

+ 4 - 0
tools/benchmarks/llm_eval_harness/README.md

File diff suppressed because it is too large

+ 12 - 10
tools/benchmarks/meta_eval_reproduce/README.md

File diff suppressed because it is too large


tools/benchmarks/meta_eval_reproduce/eval_config.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml

tools/benchmarks/meta_eval_reproduce/meta_eval.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_eval.py

tools/benchmarks/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/bbh/utils.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py

tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/gpqa_cot/utils.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py

tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/ifeval/utils.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py

tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/math_hard/utils.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py

tools/benchmarks/meta_eval_reproduce/meta_template/meta_instruct.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/meta_pretrain.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

tools/benchmarks/meta_eval_reproduce/meta_template/mmlu_pro/utils.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py

tools/benchmarks/meta_eval_reproduce/prepare_dataset.py → tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_dataset.py
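
Every rename above is the same prefix swap, so any lingering references in scripts or docs can be updated mechanically. A minimal sketch (the sample input line is hypothetical, not from the repo):

```shell
# Old and new prefixes from the renames in this commit.
old='tools/benchmarks/meta_eval_reproduce'
new='tools/benchmarks/llm_eval_harness/meta_eval_reproduce'

# Rewrite a reference to the old location; sed's s|…|…|g uses | as the
# delimiter so the slashes in the paths need no escaping.
printf 'see %s/eval_config.yaml\n' "$old" | sed "s|$old|$new|g"
```

The same substitution could be applied across a working tree with `grep -rl "$old" . | xargs sed -i "s|$old|$new|g"`, though that should be reviewed before committing.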