
Updates to accommodate OpenLLM leaderboard v2 tasks and change Meta Llama 3.1 to Llama 3.1 (#639)

Kai Wu, 7 months ago
Parent commit: 2501f519c7
26 changed files with 144 additions and 434 deletions
  1. .github/scripts/spellcheck_conf/wordlist.txt (+7 -0)
  2. tools/benchmarks/README.md (+1 -1)
  3. tools/benchmarks/llm_eval_harness/README.md (+86 -58)
  4. tools/benchmarks/llm_eval_harness/eval.py (+0 -229)
  5. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md (+24 -44)
  6. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml (+3 -3)
  7. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml (+2 -2)
  8. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/utils.py (+0 -0, renamed)
  9. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml (+2 -2)
  10. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/utils.py (+0 -0, renamed)
  11. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/ifeval.yaml (+0 -0, renamed)
  12. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/utils.py (+0 -0, renamed)
  13. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_hard_0shot_cot.yaml (+0 -0, renamed)
  14. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/utils.py (+0 -0, renamed)
  15. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_instruct.yaml (+0 -0, renamed)
  16. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_pretrain.yaml (+0 -0, renamed)
  17. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml (+2 -2)
  18. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml (+2 -2)
  19. tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py (+0 -0, renamed)
  20. tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py (+15 -15)
  21. tools/benchmarks/llm_eval_harness/open_llm_eval_prep.sh (+0 -25)
  22. tools/benchmarks/llm_eval_harness/open_llm_leaderboard/arc_challeneg_25shots.yaml (+0 -6)
  23. tools/benchmarks/llm_eval_harness/open_llm_leaderboard/hellaswag_10shots.yaml (+0 -6)
  24. tools/benchmarks/llm_eval_harness/open_llm_leaderboard/hellaswag_utils.py (+0 -24)
  25. tools/benchmarks/llm_eval_harness/open_llm_leaderboard/mmlu_5shots.yaml (+0 -9)
  26. tools/benchmarks/llm_eval_harness/open_llm_leaderboard/winogrande_5shots.yaml (+0 -6)

+ 7 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1451,6 +1451,13 @@ openhathi
 sarvam
 subtask
 acc
+BigBench
+IFEval
+MuSR
+Multistep
+multistep
+algorithmically
+asymptote
 Triaging
 matplotlib
 remediations

+ 1 - 1
tools/benchmarks/README.md

@@ -1,4 +1,4 @@
 # Benchmarks
 
 * inference - a folder contains benchmark scripts that apply a throughput analysis for Llama models inference on various backends including on-prem, cloud and on-device.
-* llm_eval_harness - a folder contains a tool to evaluate fine-tuned Llama models including quantized models focusing on quality.
+* llm_eval_harness - a folder that introduces `lm-evaluation-harness`, a tool to evaluate Llama models including quantized models focusing on quality. We also included a recipe that calculates Llama 3.1 evaluation metrics using `lm-evaluation-harness` and instructions that calculate HuggingFace Open LLM Leaderboard v2 metrics.
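
For context on what the updated README points to, the snippet below is a minimal sketch of driving `lm-evaluation-harness` from Python for a Llama 3.1 model, in the spirit of the wrapper removed further down; the model id, task names, and batch size are illustrative, and task names can vary across harness versions.

```python
# Minimal sketch; assumes `pip install lm-eval` and access to the model on the HF Hub.
import lm_eval
from lm_eval.utils import make_table

results = lm_eval.simple_evaluate(
    model="hf",                                                # HuggingFace transformers backend
    model_args="pretrained=meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
    tasks=["mmlu_pro", "ifeval"],                              # example Leaderboard-v2-style tasks
    batch_size="auto",
)
print(make_table(results))                                     # summary table of the metrics
```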

File diff suppressed because it is too large
+ 86 - 58
tools/benchmarks/llm_eval_harness/README.md


+ 0 - 229
tools/benchmarks/llm_eval_harness/eval.py

@@ -1,229 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-import argparse
-import json
-import logging
-import os
-import re
-import sys
-from pathlib import Path
-
-import numpy as np
-import lm_eval
-from lm_eval import tasks
-from lm_eval.utils import make_table
-
-
-def _handle_non_serializable(o):
-    if isinstance(o, np.int64) or isinstance(o, np.int32):
-        return int(o)
-    elif isinstance(o, set):
-        return list(o)
-    else:
-        return str(o)
-
-
-def setup_logging(verbosity):
-    logging.basicConfig(
-        level=verbosity.upper(), format="%(asctime)s - %(levelname)s - %(message)s"
-    )
-    return logging.getLogger(__name__)
-
-
-def handle_output(args, results, logger):
-    if not args.output_path:
-        if args.log_samples:
-            logger.error("Specify --output_path for logging samples.")
-            sys.exit(1)
-        logger.info(json.dumps(results, indent=2, default=_handle_non_serializable))
-        return
-
-    path = Path(args.output_path)
-    if path.is_file() or path.with_name("results.json").is_file():
-        logger.warning(f"File already exists at {path}. Results will be overwritten.")
-
-    output_dir = path.parent if path.suffix in (".json", ".jsonl") else path
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    results_str = json.dumps(results, indent=2, default=_handle_non_serializable)
-    if args.show_config:
-        logger.info(results_str)
-
-    file_path = os.path.join(args.output_path, "results.json")
-    with open(file_path , "w", encoding="utf-8") as f:
-        f.write(results_str)
-
-    if args.log_samples:
-        samples = results.pop("samples", {})
-        for task_name, _ in results.get("configs", {}).items():
-            output_name = re.sub(r"/|=", "__", args.model_args) + "_" + task_name
-            sample_file = output_dir.joinpath(f"{output_name}.jsonl")
-            sample_data = json.dumps(
-                samples.get(task_name, {}), indent=2, default=_handle_non_serializable
-            )
-            sample_file.write_text(sample_data, encoding="utf-8")
-
-    batch_sizes = ",".join(map(str, results.get("config", {}).get("batch_sizes", [])))
-    summary = f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
-    logger.info(summary)
-    logger.info(make_table(results))
-    if "groups" in results:
-        logger.info(make_table(results, "groups"))
-
-
-def load_tasks(args):
-    if args.open_llm_leaderboard_tasks:
-        current_dir = os.getcwd()
-        config_dir = os.path.join(current_dir, "open_llm_leaderboard")
-        task_manager = tasks.TaskManager(include_path=config_dir)
-        return task_manager, [
-            "arc_challenge_25_shot",
-            "hellaswag_10_shot",
-            "truthfulqa_mc2",
-            "winogrande_5_shot",
-            "gsm8k",
-            "mmlu",
-        ]
-    return None, args.tasks.split(",") if args.tasks else []
-
-
-def parse_eval_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "--model", "-m", default="hf", help="Name of model, e.g., `hf`."
-    )
-    parser.add_argument(
-        "--tasks",
-        "-t",
-        default=None,
-        help="Comma-separated list of tasks, or 'list' to display available tasks.",
-    )
-    parser.add_argument(
-        "--model_args",
-        "-a",
-        default="",
-        help="Comma-separated string arguments for model, e.g., `pretrained=EleutherAI/pythia-160m`.",
-    )
-    parser.add_argument(
-        "--open_llm_leaderboard_tasks",
-        "-oplm",
-        action="store_true",
-        default=False,
-        help="Choose the list of tasks with specification in HF open LLM-leaderboard.",
-    )
-    parser.add_argument(
-        "--num_fewshot",
-        "-f",
-        type=int,
-        default=None,
-        help="Number of examples in few-shot context.",
-    )
-    parser.add_argument(
-        "--batch_size",
-        "-b",
-        default=1,
-        help="Batch size, can be 'auto', 'auto:N', or an integer.",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=None,
-        help="Maximal batch size with 'auto' batch size.",
-    )
-    parser.add_argument(
-        "--device", default=None, help="Device for evaluation, e.g., 'cuda', 'cpu'."
-    )
-    parser.add_argument(
-        "--output_path", "-o", type=str, default=None, help="Path for saving results."
-    )
-    parser.add_argument(
-        "--limit",
-        "-L",
-        type=float,
-        default=None,
-        help="Limit number of examples per task.",
-    )
-    parser.add_argument(
-        "--use_cache", "-c", default=None, help="Path to cache db file, if used."
-    )
-    parser.add_argument(
-        "--verbosity",
-        "-v",
-        default="INFO",
-        help="Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.",
-    )
-    parser.add_argument(
-        "--gen_kwargs",
-        default=None,
-        help="Generation kwargs for tasks that support it.",
-    )
-    parser.add_argument(
-        "--check_integrity",
-        action="store_true",
-        help="Whether to run the relevant part of the test suite for the tasks.",
-    )
-    parser.add_argument(
-        "--write_out",
-        "-w",
-        action="store_true",
-        default=False,
-        help="Prints the prompt for the first few documents.",
-    )
-    parser.add_argument(
-        "--log_samples",
-        "-s",
-        action="store_true",
-        default=False,
-        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.",
-    )
-    parser.add_argument(
-        "--show_config",
-        action="store_true",
-        default=False,
-        help="If True, shows the full config of all tasks at the end of the evaluation.",
-    )
-    parser.add_argument(
-        "--include_path",
-        type=str,
-        default=None,
-        help="Additional path to include if there are external tasks.",
-    )
-    return parser.parse_args()
-
-
-def evaluate_model(args):
-    try:
-        task_manager, task_list = load_tasks(args)
-        # Customized model such as Quantized model etc.
-        # In case you are working with a custom model, you can use the following guide to add it here:
-        # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
-
-        # Evaluate
-        results = lm_eval.simple_evaluate(
-            model=args.model,
-            model_args=args.model_args,
-            tasks=task_list,
-            num_fewshot=args.num_fewshot,
-            batch_size=args.batch_size,
-            max_batch_size=args.max_batch_size,
-            device=args.device,
-            use_cache=args.use_cache,
-            limit=args.limit,
-            check_integrity=args.check_integrity,
-            write_out=args.write_out,
-            log_samples=args.log_samples,
-            gen_kwargs=args.gen_kwargs,
-            task_manager=task_manager,
-        )
-        handle_output(args, results, logger)
-
-    except Exception as e:
-        logger.error(f"An error occurred during evaluation: {e}")
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    args = parse_eval_args()
-    logger = setup_logging(args.verbosity)
-    evaluate_model(args)

File diff suppressed because it is too large
+ 24 - 44
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md


+ 3 - 3
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml

@@ -1,7 +1,7 @@
-model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
+model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
 
 
-evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
-# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals"]
 
 
 tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
 # Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
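
As a rough illustration of how these fields might be consumed, the hedged sketch below loads `eval_config.yaml` and checks `evals_dataset` against the allowed names listed in the comment above; the helper name is hypothetical and this is not the recipe's own loader.

```python
# Hypothetical config-loading sketch; it mirrors the evals_dataset validation that
# prepare_meta_eval.py performs (see the diff further down).
import yaml

VALID_EVALS_DATASETS = {
    "meta-llama/Llama-3.1-8B-Instruct-evals",
    "meta-llama/Llama-3.1-70B-Instruct-evals",
    "meta-llama/Llama-3.1-405B-Instruct-evals",
    "meta-llama/Llama-3.1-8B-evals",
    "meta-llama/Llama-3.1-70B-evals",
    "meta-llama/Llama-3.1-405B-evals",
}

def load_eval_config(path="eval_config.yaml"):
    with open(path) as f:
        config = yaml.safe_load(f)
    if config["evals_dataset"] not in VALID_EVALS_DATASETS:
        raise ValueError("evals_dataset must be a name from the Llama 3.1 Evals collection")
    return config
```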

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__bbh__details
 task: meta_bbh
 output_type: generate_until
 process_docs: !function utils.process_docs
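
For reference, the `dataset_path`/`dataset_name` pair above amounts to a plain `datasets` load of the BBH details subset; the snippet below only illustrates that resolution (the evals repo is gated, so `huggingface-cli login` and granted access are assumed) and is not code from the recipe.

```python
# Illustrative only: what the task config above points the harness at.
from datasets import load_dataset

bbh_details = load_dataset(
    "meta-llama/Llama-3.1-8B-evals",           # dataset_path
    name="Llama-3.1-8B-evals__bbh__details",   # dataset_name
)
print(bbh_details)
```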

tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/bbh/utils.py


+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml

@@ -1,5 +1,5 @@
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__gpqa__details
 task: meta_gpqa
 output_type: generate_until
 process_docs: !function utils.process_docs

tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/gpqa_cot/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/ifeval.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/ifeval/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/math_hard_0shot_cot.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/math_hard/utils.py


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_instruct.yaml


tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/meta_pretrain.yaml


+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_instruct
-dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
-dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

+ 2 - 2
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml

@@ -1,6 +1,6 @@
 task: meta_mmlu_pro_pretrain
-dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
-dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__mmlu_pro__details
 test_split: latest
 output_type: generate_until
 process_docs: !function utils.process_docs

tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py → tools/benchmarks/llm_eval_harness/meta_eval/meta_template/mmlu_pro/utils.py


+ 15 - 15
tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py

@@ -16,12 +16,12 @@ from datasets import Dataset, load_dataset
 def get_ifeval_data(model_name, output_dir):
     print(f"preparing the ifeval data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for IFEval"
         )
     original_dataset_name = "wis-k/instruction-following-eval"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -59,12 +59,12 @@ def get_ifeval_data(model_name, output_dir):
 def get_math_data(model_name, output_dir):
     print(f"preparing the math data using {model_name}'s evals dataset")
     if model_name not in [
-        "Meta-Llama-3.1-8B-Instruct",
-        "Meta-Llama-3.1-70B-Instruct",
-        "Meta-Llama-3.1-405B-Instruct",
+        "Llama-3.1-8B-Instruct",
+        "Llama-3.1-70B-Instruct",
+        "Llama-3.1-405B-Instruct",
     ]:
         raise ValueError(
-            "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard"
+            "Only Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Llama-3.1-405B-Instruct models are supported for MATH_hard"
         )
     original_dataset_name = "lighteval/MATH-Hard"
     meta_dataset_name = f"meta-llama/{model_name}-evals"
@@ -130,7 +130,7 @@ def change_yaml(args, base_name):
         with open(output_path, "w") as output:
             for line in lines:
                 output.write(
-                    line.replace("Meta-Llama-3.1-8B", base_name).replace(
+                    line.replace("Llama-3.1-8B", base_name).replace(
                         "WORK_DIR", str(yaml_dir)
                     )
                 )
@@ -208,12 +208,12 @@ if __name__ == "__main__":
     if not os.path.exists(args.template_dir):
         raise ValueError("The template_dir does not exist, please check the path")
     if args.evals_dataset not in [
-        "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-70B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-405B-Instruct-evals",
-        "meta-llama/Meta-Llama-3.1-8B-evals",
-        "meta-llama/Meta-Llama-3.1-70B-evals",
-        "meta-llama/Meta-Llama-3.1-405B-evals",
+        "meta-llama/Llama-3.1-8B-Instruct-evals",
+        "meta-llama/Llama-3.1-70B-Instruct-evals",
+        "meta-llama/Llama-3.1-405B-Instruct-evals",
+        "meta-llama/Llama-3.1-8B-evals",
+        "meta-llama/Llama-3.1-70B-evals",
+        "meta-llama/Llama-3.1-405B-evals",
     ]:
         raise ValueError(
             "The evals dataset is not valid, please double check the name, must use the name in the Llama 3.1 Evals collection"

+ 0 - 25
tools/benchmarks/llm_eval_harness/open_llm_eval_prep.sh

@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-#!/bin/bash
-
-# Prompt the user for the EVAL_PATH
-read -p "Enter the asbolute path to the lm-evaluation-harness: " EVAL_PATH
-conda activate 
-# Directory containing YAML files
-DIR="open_llm_leaderboard"
-
-# Check if the directory exists
-if [ ! -d "$DIR" ]; then
-    echo "Error: Directory '$DIR' not found."
-    exit 1
-fi
-
-# Iterate over YAML files in the directory and update them
-for YAML_FILE in "$DIR"/*.yaml
-do
-    if [ -f "$YAML_FILE" ]; then
-        sed -i 's|{\$EVAL_PATH}|'"$EVAL_PATH"'|g' "$YAML_FILE"
-        echo "Updated $YAML_FILE with EVAL_PATH: $EVAL_PATH"
-    fi
-done

+ 0 - 6
tools/benchmarks/llm_eval_harness/open_llm_leaderboard/arc_challeneg_25shots.yaml

@@ -1,6 +0,0 @@
-include: {$EVAL_PATH}/lm_eval/tasks/arc/arc_challenge.yaml
-task: arc_challenge_25_shot
-task_alias: arc 25 shot
-num_fewshot: 25
-metric_list:
-  - metric: acc_norm

+ 0 - 6
tools/benchmarks/llm_eval_harness/open_llm_leaderboard/hellaswag_10shots.yaml

@@ -1,6 +0,0 @@
-include: {$EVAL_PATH}/lm_eval/tasks/hellaswag/hellaswag.yaml
-task: hellaswag_10_shot
-task_alias: hellaswag 10 shot
-num_fewshot: 10
-metric_list:
-  - metric: acc_norm

+ 0 - 24
tools/benchmarks/llm_eval_harness/open_llm_leaderboard/hellaswag_utils.py

@@ -1,24 +0,0 @@
-import datasets
-import re
-
-
-def preprocess(text):
-    text = text.strip()
-    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-    text = text.replace(" [title]", ". ")
-    text = re.sub("\\[.*?\\]", "", text)
-    text = text.replace("  ", " ")
-    return text
-
-
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
-    def _process_doc(doc):
-        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
-        out_doc = {
-            "query": preprocess(doc["activity_label"] + ": " + ctx),
-            "choices": [preprocess(ending) for ending in doc["endings"]],
-            "gold": int(doc["label"]),
-        }
-        return out_doc
-
-    return dataset.map(_process_doc)

+ 0 - 9
tools/benchmarks/llm_eval_harness/open_llm_leaderboard/mmlu_5shots.yaml

@@ -1,9 +0,0 @@
-include: {$EVAL_PATH}/lm_eval/tasks/mmlu/default/_mmlu.yaml
-task:
-  - mmlu_stem
-  - mmlu_other
-  - mmlu_social_sciences
-  - mmlu_humanities
-num_fewshot: 5
-metric_list:
-  - metric: acc

+ 0 - 6
tools/benchmarks/llm_eval_harness/open_llm_leaderboard/winogrande_5shots.yaml

@@ -1,6 +0,0 @@
-include: {$EVAL_PATH}/lm_eval/tasks/winogrande/default.yaml
-task: winogrande_5_shot
-task_alias: winogrande 5 shot
-num_fewshot: 5
-metric_list:
-  - metric: acc