# ## Evaluating HumanEval Results using Modal Sandboxes
#
# This script will take generated results and evaluate them.
# We use Modal Sandboxes to safely evaluate LLM-generated results.
#
# Run it with:
#
#     modal run eval
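#
# Sample files are expected on the `humaneval` Volume at
# `/humaneval/{env}/{run}/{id}.jsonl`, with a `data.jsonl` problem file in each
# env directory. For every sample file that doesn't have results yet, we spawn
# a Sandbox and write a matching `{id}.jsonl_results.jsonl` back next to it.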
 
from pathlib import Path

import modal
from modal.exception import FunctionTimeoutError

app = modal.App("many-llamas-human-eval")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)
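
# The Sandbox image clones the human-eval harness and installs it, which gives
# us the `evaluate_functional_correctness` CLI used to score each sample file.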
 
sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60
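
# `eval_single_task` evaluates a single sample file: it uploads the samples and
# the problem set to an ephemeral Volume, runs the harness against them in a
# Sandbox, and copies the resulting scores back to the shared `humaneval` Volume.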
 
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_single_task(sample_file_path: str, problem_file_path: str):
    # Stage the sample and problem files on an ephemeral Volume that only this
    # evaluation's Sandbox needs to see.
    with modal.Volume.ephemeral() as vol:
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")

        print(f"Starting sandbox for {sample_file_path}")

        # Run the human-eval harness in a Sandbox, with the ephemeral Volume
        # mounted at /vol.
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=10 * MINUTES,
            cpu=32,
        )

        try:
            sandbox.wait()
            print(f"Finished sandbox for {sample_file_path}")
        except FunctionTimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())

            # The harness writes its output next to the input file; copy it back
            # to the shared Volume alongside the original sample file.
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())
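
# `eval_all_tasks` walks the shared Volume, skips sample files that already have
# results, and fans the remaining ones out to `eval_single_task` in parallel.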
 
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_all_tasks():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip files that end with _results.jsonl
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")

            # Check if the corresponding results file exists
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                # If it doesn't, spawn an evaluation for this sample file
                print("Spawning on", file_path, problem_file)
                handles.append(eval_single_task.spawn(file_path, problem_file))

        # Wait for all spawned evaluations in this env to finish
        for handle in handles:
            handle.get()
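
# Running `modal run eval` calls `main`, which kicks off `eval_all_tasks`
# remotely and waits for it to complete.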
 
@app.local_entrypoint()
def main():
    eval_all_tasks.remote()
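
# As a quick local sanity check, you can tally a results file after pulling it
# down from the Volume (for example with `modal volume get`). This is only a
# sketch: it assumes the harness's usual output format, where each JSONL record
# carries a boolean `passed` field, and uses a hypothetical local file name.
#
#     import json
#
#     passed = total = 0
#     with open("0.jsonl_results.jsonl") as f:  # hypothetical local copy
#         for line in f:
#             record = json.loads(line)
#             passed += bool(record["passed"])
#             total += 1
#     print(f"{passed}/{total} samples passed")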
 
 