# ## Evaluating HumanEval Results using Modal Sandboxes
#
# This script takes the generated samples and evaluates them for functional correctness.
# We use Modal Sandboxes to safely execute and evaluate the LLM-generated code.
#
# Run it with:
#
#     modal run eval

from pathlib import Path

import modal
from modal.exception import FunctionTimeoutError
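
# Generated samples and evaluation results are stored on a shared Volume named `humaneval`.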

app = modal.App("many-llamas-human-eval")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)
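
# The evaluation runs inside a Sandbox whose image clones and installs the human-eval
# harness, which provides the `evaluate_functional_correctness` command.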

sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60
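

# `eval_single_task` evaluates one samples file against its problems file:
# it copies both into an ephemeral Volume, runs the harness in an isolated Sandbox
# (so untrusted, LLM-generated code never executes in this Function's container),
# and then writes the results file back next to the samples file.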

@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_single_task(sample_file_path: str, problem_file_path: str):
    with modal.Volume.ephemeral() as vol:
        # Upload the samples and problems into the ephemeral Volume the Sandbox reads from.
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")

        print(f"Starting sandbox for {sample_file_path}")
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=10 * MINUTES,
            cpu=32,
        )

        try:
            sandbox.wait()
            print(f"Finished sandbox for {sample_file_path}")
        except FunctionTimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())
            # Copy the results file out of the ephemeral Volume and write it
            # next to the samples file on the shared `humaneval` Volume.
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())
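

# `eval_all_tasks` walks the shared Volume for sample files laid out as
# /humaneval/{env}/{run}/{id}.jsonl and spawns `eval_single_task` for every
# file that doesn't yet have a corresponding `_results.jsonl`.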

@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_all_tasks():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip results files from previous runs
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")
            # Only evaluate files whose results file doesn't exist yet
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                print("Spawning on", file_path, problem_file)
                handles.append(eval_single_task.spawn(file_path, problem_file))

        # Wait for all spawned evaluations to finish
        for handle in handles:
            handle.get()
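

# Kick everything off from your local machine with `modal run eval`.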

@app.local_entrypoint()
def main():
    eval_all_tasks.remote()