Browse Source

Updated MMLU and harness

Justin Lee 2 months ago
parent
commit
423231e139

+ 1 - 0
end-to-end-use-cases/benchmarks/llm_eval_harness/.gitignore

@@ -1 +1,2 @@
 **/eval_results/**
+**/old_eval_results/**

+ 1 - 1
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml

@@ -1,4 +1,4 @@
 group: meta_instruct
 task:
-# - meta_mmlu
+# - meta_mmlu_instruct
 - meta_mmlu_pro_instruct

+ 20 - 5
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml

@@ -1,14 +1,29 @@
-task: meta_mmlu
+task: meta_mmlu_instruct
 dataset_path: meta-llama/Llama-3.3-70B-Instruct-evals
 dataset_name: Llama-3.3-70B-Instruct-evals__mmlu__0_shot__cot__details
 test_split: latest
-output_type: multiple_choice
+output_type: generate_until
 process_docs: !function utils.process_docs
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-doc_to_choice: ["A", "B", "C", "D"]
-# 5-shot prompts are already included in the dataset
-# So no need to generate
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
 num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
 metadata:
   version: 1.0

File diff suppressed because it is too large
+ 4 - 8
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py


File diff suppressed because it is too large
+ 1 - 1
end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py


File diff suppressed because it is too large
+ 2433 - 1095
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb