# ## Evaluating HumanEval Results using Modal Sandboxes
#
# This script takes generated results and evaluates them.
# We use Modal Sandboxes to safely evaluate LLM-generated code.
#
# Run it with:
#
#     modal run eval
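
# The generations to grade live on the `humaneval` Volume, which the code
# below expects to be laid out as /humaneval/{env}/{run}/{id}.jsonl, with a
# data.jsonl problem file in each env directory. Each samples file should be
# in the format the human-eval harness consumes: one JSON object per line
# with "task_id" and "completion" fields. Graded results are written back
# next to each samples file as {id}.jsonl_results.jsonl.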
from pathlib import Path

import modal
from modal.exception import FunctionTimeoutError

app = modal.App("many-llamas-human-eval")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

# This image clones and installs the human-eval harness, which provides the
# `evaluate_functional_correctness` CLI that the Sandboxes invoke below.
sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60
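

# eval_single_task stages one samples file and its problem file into an
# ephemeral Volume, runs the harness on them inside an isolated Sandbox, and
# copies the graded results back onto the persisted `humaneval` Volume.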
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_single_task(sample_file_path: str, problem_file_path: str):
    with modal.Volume.ephemeral() as vol:
        # Stage the inputs under the fixed names the Sandbox command expects.
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")

        print(f"Starting sandbox for {sample_file_path}")

        # Run the harness in an isolated Sandbox, with the ephemeral Volume
        # mounted at /vol.
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=10 * MINUTES,
            cpu=32,
        )

        try:
            sandbox.wait()
            print(f"Finished sandbox for {sample_file_path}")
        except FunctionTimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())
            # Copy the graded results out of the ephemeral Volume and save
            # them next to the samples file on the persisted Volume.
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())


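# eval_all_tasks scans the Volume and spawns one eval_single_task call per
# samples file that does not yet have a results file, so re-running the
# script only evaluates new generations.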
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_all_tasks():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip files that end with _results.jsonl
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")

            # Check if the corresponding results file exists
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                # If it doesn't exist, spawn an evaluation for this file
                print("Spawning on", file_path, problem_file)
                handles.append(eval_single_task.spawn(file_path, problem_file))

        # .spawn() launches the evaluations in parallel; .get() blocks until
        # each one has finished.
        for handle in handles:
            handle.get()


@app.local_entrypoint()
def main():
    eval_all_tasks.remote()
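

# A minimal sketch of inspecting one results file locally, assuming the
# harness's standard per-sample output fields ("0.jsonl" here is a
# hypothetical samples file name):
#
#     import json
#
#     with open("0.jsonl_results.jsonl") as f:
#         results = [json.loads(line) for line in f]
#     passed = sum(r["passed"] for r in results)
#     print(f"{passed} / {len(results)} samples passed")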