# ## Generating HumanEval Results with our Llama 3.2 3B Instruct Model
# This app starts many parallel clients to send requests to the vLLM server.
#
# For each of the tasks in the HumanEval test set, we'll run a client to request 1000 completions.
# Results are saved to our mounted volume.
#
# Run it with:
#
#     modal run generate --data-dir test --no-dry-run --n 1000 --subsample 100
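#
# Note that `dry_run` defaults to `True`, which exercises the client fan-out and
# boots the inference server without requesting any completions. Pass
# `--no-dry-run`, as above, to actually generate.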

from datetime import datetime
import json
from pathlib import Path
from dataclasses import dataclass, asdict

import modal

# This defines the image to use for running OpenAI clients in parallel
image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "openai==1.38.0", "datasets==2.20.0"
)

app = modal.App("many-llamas-human-eval", image=image)

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

DATA_DIR = Path("/mnt/humaneval")

default_system_prompt = "Write the body for the Python function provided in the prompt below. Do not write anything else. Your output will be directly concatenated with the prompt and the resulting function executed against tests."

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@dataclass
class CompletionParams:
    model: str = None
    max_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0
    presence_penalty: float = 0
    n: int = 1
    stop: str = None
    seed: int = None


@dataclass
class ClientParams:
    app_name: str = "many-llamas-human-eval"
    workspace: str = None
    api_key: str = "super-secret-token"  # match the secret in inference.py

    @property
    def url(self):
        return f"https://{self.workspace}--{self.app_name}-serve.modal.run/v1"
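
# For a hypothetical workspace named `my-workspace`, the `url` property above
# resolves to:
#
#     https://my-workspace--many-llamas-human-eval-serve.modal.run/v1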


@app.local_entrypoint()
def main(
    app_name: str = "many-llamas-human-eval",
    workspace: str = None,
    api_key: str = "super-secret-token",
    model: str = None,
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.9,
    frequency_penalty: float = 0,
    presence_penalty: float = 0,
    n: int = 1,
    stop: str = None,
    seed: int = None,
    data_dir: str = "dev-llm",
    subsample: int = 1,  # percent of the test split to read
    system_prompt: str = default_system_prompt,
    dry_run: bool = True,
):
    if workspace is None:
        workspace = modal.config._profile

    client_params = ClientParams(app_name, workspace, api_key)

    completion_params = CompletionParams(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        n=n,
        stop=stop,
        seed=seed,
    )

    # Run a remote download function to save the HumanEval dataset in the cloud volume
    save_dataset.remote(path=data_dir, subsample=subsample)

    # Run a remote generation function
    results = run_human_eval.remote(
        client_params=client_params,
        completion_params=completion_params,
        system_prompt=system_prompt,
        data_dir=data_dir,
        dry_run=dry_run,
    )
    if results:
        with open("/tmp/results.jsonl", "w") as f:
            f.writelines(json.dumps(result) + "\n" for result in results)
        print(f"results saved locally to {f.name}")


# This is the parent function that spawns a client for each eval task
@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_human_eval(
    client_params: ClientParams,
    completion_params: CompletionParams,
    data_dir="dev-llm",
    system_prompt: str = default_system_prompt,
    dry_run=True,
):
    dataset = load_dataset(data_dir)

    timestamp = datetime.utcnow().isoformat() + "Z"
    output_dir = Path(DATA_DIR) / data_dir / f"run-{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)

    handles = []
    print(f"Eval set contains {len(dataset)} items")

    # For each eval item in the dataset, spawn a parallel OpenAI client worker that generates n completions each
    print(
        Colors.BOLD,
        "Spawning clients for each eval item. You may notice a brief wait while the inference server(s) boot.",
        Colors.END,
        sep="",
    )
    for item in dataset:
        handles.append(
            run_item.spawn(
                item,
                client_params,
                completion_params,
                system_prompt,
                output_dir,
                dry_run,
            )
        )

    # Gather the completions from every worker so main can also save them locally
    results = []
    for handle in handles:
        results.extend(handle.get())

    if not dry_run:
        return results
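
# Each worker writes one JSONL file per task into the run directory on the volume.
# For HumanEval task `HumanEval/0` and the default data dir, for example, that is:
#
#     /mnt/humaneval/dev-llm/run-<timestamp>/0.jsonl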


# This function is responsible for generating n completions for a single eval item
# It calls into our deployed vLLM server and saves results to the cloud volume
@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_item(
    item: dict,
    client_params: ClientParams,
    completion_params: CompletionParams,
    system_prompt: str,
    output_dir: Path,
    dry_run: bool,
):
    client = create_client(client_params)
    if not completion_params.model:
        # default to the first (here, the only) model served by the vLLM server
        completion_params.model = client.models.list().data[0].id

    prompt = item["prompt"]
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    per_request = 250  # max completions per API call; with n=1000, each worker makes four calls
    ct, completions = completion_params.n, []
    if not dry_run:
        while ct > 0:
            response = get_completion(
                client,
                messages=messages,
                **asdict(completion_params) | dict(n=min(ct, per_request)),
            )
            if response:
                completions += [
                    {
                        "task_id": item["task_id"],
                        "completion": choice.message.content,
                    }
                    for choice in response.choices
                ]
            ct -= per_request

        index = item["task_id"].split("/")[-1]
        output_path = output_dir / f"{index}.jsonl"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            f.writelines(json.dumps(completion) + "\n" for completion in completions)

        print(Colors.GREEN + f"Completions saved to {output_path}" + Colors.END)

    return completions
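
# Each line of a per-task file is a record of the form
# `{"task_id": "HumanEval/0", "completion": "..."}`, the format expected by the
# HumanEval evaluation harness.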


class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def get_completion(client, **kwargs):
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def create_client(client_params: ClientParams):
    from openai import OpenAI

    client = OpenAI(api_key=client_params.api_key)
    client.base_url = client_params.url

    return client


# This function downloads the HumanEval dataset
@app.function(volumes={DATA_DIR: volume})
def save_dataset(path="dev-llm", subsample: int = 1):
    import datasets

    path = DATA_DIR / path

    ds = datasets.load_dataset(
        "openai/openai_humaneval",
        # reads 0% to subsample% of the test split
        split=datasets.ReadInstruction("test", to=subsample, unit="%"),
    )

    ds.to_json(path / "data.jsonl")

    volume.commit()
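
# Note that `subsample` is a percentage, not a count: `--subsample 100` saves the
# full test split of 164 problems, while the default of 1 reads only ~1% of it,
# which keeps dev runs small.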


def load_dataset(path="dev-llm"):
    import datasets

    path = DATA_DIR / path

    # loading from local files places the data under the "train" split by default
    ds = datasets.load_dataset(path=str(path), data_files="data.jsonl")

    return ds["train"]
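
# Once a run completes, the saved completions can be inspected directly on the
# volume with the Modal CLI, e.g.:
#
#     modal volume ls humaneval dev-llm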