eval.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. from pathlib import Path
  2. import modal
  3. app = modal.App("many-llamas-human-eval")
  4. volume = modal.Volume.from_name("humaneval", create_if_missing=True)
  5. sandbox_image = (
  6. modal.Image.debian_slim()
  7. .apt_install("git")
  8. .run_commands(
  9. "git clone https://github.com/modal-labs/human-eval.git",
  10. "pip install -e human-eval",
  11. )
  12. )
  13. MINUTES = 60
  14. @app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
  15. def eval_single_task(sample_file_path: str, problem_file_path: str):
  16. with modal.Volume.ephemeral() as vol:
  17. with vol.batch_upload() as batch:
  18. batch.put_file(sample_file_path, "samples.jsonl")
  19. batch.put_file(problem_file_path, "problems.jsonl")
  20. print(f"Starting sandbox for {sample_file_path}")
  21. sandbox = modal.Sandbox.create(
  22. "bash",
  23. "-c",
  24. "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
  25. image=sandbox_image,
  26. volumes={"/vol": vol},
  27. timeout=10 * MINUTES,
  28. cpu=32,
  29. )
  30. try:
  31. sandbox.wait()
  32. print(f"Finished sandbox for {sample_file_path}")
  33. except FunctionTimeoutError:
  34. print("Sandbox timed out")
  35. if sandbox.returncode == 0:
  36. print(sandbox.stdout.read())
  37. data = b""
  38. for chunk in vol.read_file("samples.jsonl_results.jsonl"):
  39. data += chunk
  40. with open(f"{sample_file_path}_results.jsonl", "wb") as f:
  41. f.write(data)
  42. else:
  43. print(f"Tests failed with code {sandbox.returncode}")
  44. print(sandbox.stderr.read())
  45. @app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
  46. def eval_all_tasks():
  47. import os
  48. volume.reload()
  49. # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
  50. envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
  51. for env in envs:
  52. print(f"looking in {env}")
  53. problem_file = env / "data.jsonl"
  54. pattern = "*/*.jsonl"
  55. handles = []
  56. for file_path in env.glob(pattern):
  57. # Skip files that end with _results.jsonl
  58. if str(file_path).endswith("_results.jsonl"):
  59. continue
  60. print(f"Checking {file_path}")
  61. # Check if the corresponding results file exists
  62. results_file = f"{file_path}_results.jsonl"
  63. if not os.path.exists(results_file):
  64. # If it doesn't exist, run do_eval
  65. print("Spawning on", file_path, problem_file)
  66. handles.append(eval_single_task.spawn(file_path, problem_file))
  67. for handle in handles:
  68. handle.get()
  69. @app.local_entrypoint()
  70. def main():
  71. eval_all_tasks.remote()