@@ -1,3 +1,10 @@
+# ## Evaluating HumanEval Results using Modal Sandboxes
+# This script takes generated results and evaluates them.
+# We use Modal Sandboxes to safely execute the LLM-generated code.
+#
+# Run it with:
+# modal run eval
+
from pathlib import Path
import modal
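For context (not part of the diff): Modal Sandboxes run untrusted code in isolated containers, which is what makes executing LLM-generated programs safe here. A minimal sketch, assuming a hypothetical app name and timeout that are not taken from this PR:

```python
import modal

# Minimal sketch of running untrusted, LLM-generated code in a Sandbox.
# The app name and timeout are illustrative assumptions, not from this diff.
app = modal.App.lookup("humaneval-eval-example", create_if_missing=True)

untrusted_code = "print('hello from the sandbox')"

sb = modal.Sandbox.create(
    "python", "-c", untrusted_code,  # entrypoint runs inside an isolated container
    app=app,
    timeout=60,  # kill runaway generations after a minute
)
sb.wait()                # block until the sandboxed process exits
print(sb.returncode)     # 0 if the generated code ran cleanly
print(sb.stdout.read())  # captured output from the sandbox
```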
@@ -1,3 +1,9 @@
+# ## Plotting HumanEval Results
+# This script calculates pass@k and fail@k for our experiment and plots them.
+#
+# Run it with:
+# modal run plot
+
import io
import json
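The estimator itself isn't shown in the diff; presumably it is the standard unbiased pass@k estimator from the HumanEval paper, with fail@k as its complement. A sketch of that calculation (function names here are illustrative):

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimator: pass@k = 1 - C(n - c, k) / C(n, k),
    with n total samples per problem and c correct samples."""
    if n - c < k:
        return 1.0  # every size-k draw must contain a correct sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

def fail_at_k(n: int, c: int, k: int) -> float:
    # Probability that all k drawn samples fail.
    return 1.0 - pass_at_k(n, c, k)
```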