'''
Generate prompts for the LLM Needle Haystack.
Source code from:
https://github.com/gkamradt/LLMTest_NeedleInAHaystack/tree/main
https://github.com/THUDM/LongAlign/tree/main/Needle_test
'''
import argparse
import asyncio
import glob
import json
import os
from asyncio import Semaphore

import numpy as np
import tiktoken
import yaml
from anthropic import Anthropic
from dotenv import load_dotenv
from transformers import AutoTokenizer

load_dotenv()


class Prompter:
    """
    Generates the prompts for the LLM Needle-in-a-Haystack test.
    """
    def __init__(self,
                 needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
                 haystack_dir="PaulGrahamEssays",
                 retrieval_question="What is the best thing to do in San Francisco?",
                 context_lengths_min=1000,
                 context_lengths_max=200000,
                 context_lengths_num_intervals=35,
                 context_lengths=None,
                 document_depth_percent_min=0,
                 document_depth_percent_max=100,
                 document_depth_percent_intervals=35,
                 document_depth_percents=None,
                 document_depth_percent_interval_type="linear",
                 tokenizer_type="OpenAI",
                 model_name="gpt-4-1106-preview",
                 num_concurrent_requests=1,
                 final_context_length_buffer=200,
                 save_dir="prompts",
                 print_ongoing_status=True):
  43. """
  44. :param needle: The needle to be found in the haystack. Default is None.
  45. :param haystack_dir: The directory of text files to use as background context (or a haystack) in which the needle is to be found. Default is Paul Graham Essays.
  46. :param retrieval_question: The question which with to prompt the model to do the retrieval.
  47. :param results_version: In case you would like to try the same combination of model, context length, and depth % multiple times, change the results version other than 1
  48. :param num_concurrent_requests: Due to volume, this object is set up to run concurrent requests, default = 1. Be careful of rate limits.
  49. :param save_results: Whether or not you would like to save your contexts to file. Warning: These will get long! Default = True
  50. :param save_contexts: Whether or not you would like to save your contexts to file. Warning: These will get long! Default is True.
  51. :param final_context_length_buffer: The amount of cushion you'd like to leave off the input context to allow for the output context. Default 200 tokens
  52. :param context_lengths_min: The minimum length of the context. Default is 1000.
  53. :param context_lengths_max: The maximum length of the context. Default is 200000.
  54. :param context_lengths_num_intervals: The number of intervals for the context length. Default is 35.
  55. :param context_lengths: The lengths of the context. Default is None.
  56. :param document_depth_percent_min: The minimum depth percent of the document. Default is 0.
  57. :param document_depth_percent_max: The maximum depth percent of the document. Default is 100.
  58. :param document_depth_percent_intervals: The number of intervals for the document depth percent. Default is 35.
  59. :param document_depth_percents: The depth percentages of the document. Default is None.
  60. :param document_depth_percent_interval_type: The type of interval for the document depth percent. Must be either 'linear' or 'sigmoid'. Default is 'linear'.
  61. :param model_provider: The provider of the model. Must be either 'OpenAI' or 'Anthropic'. Default is 'OpenAI'.
  62. :param openai_api_key: The API key for OpenAI. Default is None.
  63. :param anthropic_api_key: The API key for Anthropic. Default is None.
  64. :param model_name: The name of the model. Default is 'gpt-4-1106-preview'.
  65. :param seconds_to_sleep_between_completions: The number of seconds to sleep between completions. Default is None.
  66. :param print_ongoing_status: Whether or not to print the ongoing status. Default is True.
  67. """
        if not needle or not haystack_dir or not retrieval_question:
            raise ValueError("Needle, haystack, and retrieval_question must be provided.")

        self.needle = needle
        self.haystack_dir = haystack_dir
        self.retrieval_question = retrieval_question
        self.num_concurrent_requests = num_concurrent_requests
        self.final_context_length_buffer = final_context_length_buffer
        self.print_ongoing_status = print_ongoing_status
        self.tokenizer_type = tokenizer_type
        self.model_name = model_name
        self.testing_results = []

        if context_lengths is None:
            if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None:
                raise ValueError("Either context_lengths_min, context_lengths_max, and context_lengths_num_intervals need to be filled out OR a context_lengths list needs to be supplied.")
            else:
                self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int)
        else:
            self.context_lengths = context_lengths

        if document_depth_percent_interval_type not in [None, "linear", "sigmoid"]:
            raise ValueError("document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution, pass a list of ints via document_depth_percents.")

        if document_depth_percents is None:
            if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None:
                raise ValueError("Either document_depth_percent_min, document_depth_percent_max, and document_depth_percent_intervals need to be filled out OR a document_depth_percents list needs to be supplied.")
            elif document_depth_percent_interval_type == 'linear':
                self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int)
            elif document_depth_percent_interval_type == 'sigmoid':
                self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)]
        else:
            self.document_depth_percents = document_depth_percents

        if self.tokenizer_type == "OpenAI":
            assert self.model_name is not None, "If you're using OpenAI, you must provide a model name."
            self.enc = tiktoken.encoding_for_model(self.model_name)
        elif self.tokenizer_type == "Anthropic":
            self.enc = Anthropic().get_tokenizer()
        elif self.tokenizer_type == "Huggingface":
            self.enc = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True, use_fast=False)
        else:
            raise ValueError("tokenizer_type is not supported. Must be either 'OpenAI', 'Anthropic', or 'Huggingface'.")

        self.save_dir = save_dir
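
    # With the defaults above, self.context_lengths holds 35 lengths spread
    # linearly from 1,000 to 200,000 tokens and self.document_depth_percents
    # holds 35 depths from 0% to 100%, giving 35 x 35 = 1,225
    # (context length, depth) prompt combinations.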

    def logistic(self, x, L=100, x0=50, k=.1):
        if x == 0:
            return 0
        if x == 100:
            return 100
        return np.round(L / (1 + np.exp(-k * (x - x0))), 3)
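
    # Example values (computed from the logistic formula above):
    #   logistic(0) -> 0 (clamped), logistic(25) -> 7.586, logistic(50) -> 50.0,
    #   logistic(75) -> 92.414, logistic(100) -> 100 (clamped).
    # Pushing a linear sweep through this sigmoid clusters the sampled depths
    # near the start and end of the document.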

    async def bound_evaluate_and_log(self, sem, *args):
        async with sem:
            await self.evaluate_and_log(*args)

    async def run_test(self):
        sem = Semaphore(self.num_concurrent_requests)

        # Run through each iteration of context_lengths and depths
        tasks = []
        for context_length in self.context_lengths:
            for depth_percent in self.document_depth_percents:
                task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
                tasks.append(task)

        # Wait for all tasks to complete
        await asyncio.gather(*tasks)

    def generate_prompt(self, context):
        return [
            {
                "role": "system",
                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
            },
            {
                "role": "user",
                "content": context + '\n\n' + f"Read the document and answer: {self.retrieval_question}"
            },
        ]
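
    # The result is an OpenAI-style chat message list, e.g.:
    #   [{"role": "system", "content": "You are a helpful AI bot ..."},
    #    {"role": "user",   "content": "<haystack with needle>\n\nRead the document and answer: <retrieval_question>"}]
    # which is what evaluate_and_log serializes to JSON.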

    async def evaluate_and_log(self, context_length, depth_percent):
        # Generate the required-length context and place the needle statement in it
        context = await self.generate_context(context_length, depth_percent)
        print('Generate for context length:', context_length, 'depth percent:', depth_percent)

        # Prepare the message to send to the model you're going to evaluate
        prompt = self.generate_prompt(context)
        context_file_location = f'{self.tokenizer_type}_len_{context_length}_depth_{int(depth_percent*100)}'

        # Save the prompt to file so the same context can be retested later,
        # e.g. if the program stops running and you want to restart
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        with open(f'{self.save_dir}/{context_file_location}_prompts.json', 'w') as f:
            json.dump(prompt, f)

    async def generate_context(self, context_length, depth_percent):
        # Get the haystack files loaded into a string
        context = self.read_context_files()

        # Truncate the haystack to the desired context length
        context = self.encode_and_trim(context, context_length)

        # Insert the needle statement at the given depth percent
        context = self.insert_needle(context, depth_percent, context_length)

        return context

    def encode_text_to_tokens(self, text):
        if self.tokenizer_type == "OpenAI":
            return self.enc.encode(text)
        elif self.tokenizer_type == "Anthropic":
            # The Anthropic tokenizer returns an Encoding object; take its ids
            return self.enc.encode(text).ids
        elif self.tokenizer_type == "Huggingface":
            return self.enc(text, truncation=False, return_tensors="pt", add_special_tokens=False).input_ids.view(-1).tolist()
        else:
            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")

    def insert_needle(self, context, depth_percent, context_length):
        tokens_needle = self.encode_text_to_tokens(self.needle)
        tokens_context = self.encode_text_to_tokens(context)

        # Reduce the usable context by the buffer (default 200 tokens) to account
        # for the system message, the user question, and the response
        context_length -= self.final_context_length_buffer

        # If the context + needle are longer than the target context length (which
        # they will be), trim tokens from the context to make room for the needle
        if len(tokens_context) + len(tokens_needle) > context_length:
            tokens_context = tokens_context[:context_length - len(tokens_needle)]

        if depth_percent == 100:
            # If the depth percent is 100 (the needle is the last thing in the doc), put it at the end
            tokens_new_context = tokens_context + tokens_needle
        else:
            # Get the position (in tokens) at which to insert the needle
            insertion_point = int(len(tokens_context) * (depth_percent / 100))

            # tokens_new_context represents the tokens before the needle
            tokens_new_context = tokens_context[:insertion_point]

            # We want the needle to land at a sentence break, so collect the token
            # id(s) for '.'; the hardcoded 30930 id is kept from the source for
            # tokenizers where '.' encoded alone differs from '.' mid-text
            period_tokens = self.encode_text_to_tokens('.') + [30930]

            # Then iterate backwards until we find the first period
            while tokens_new_context and tokens_new_context[-1] not in period_tokens:
                insertion_point -= 1
                tokens_new_context = tokens_context[:insertion_point]

            # Once there, add the needle, then stick the rest of the context on the
            # other end. Now we have a needle in a haystack.
            tokens_new_context += tokens_needle + tokens_context[insertion_point:]

        # Convert back to a string and return it
        new_context = self.decode_tokens(tokens_new_context)
        return new_context
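
    # Worked example: with context_length=1000, the default 200-token buffer and
    # depth_percent=25, context + needle are trimmed to fit in 800 tokens, the
    # insertion point starts ~25% into the trimmed context, and it is walked
    # backwards to the previous '.' token so the needle starts on a sentence boundary.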

    def get_context_length_in_tokens(self, context):
        if self.tokenizer_type == "OpenAI":
            return len(self.enc.encode(context))
        elif self.tokenizer_type == "Anthropic":
            # The Anthropic tokenizer returns an Encoding object; count its ids
            return len(self.enc.encode(context).ids)
        elif self.tokenizer_type == "Huggingface":
            return self.enc(context, truncation=False, return_tensors="pt").input_ids.shape[-1]
        else:
            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")

    def read_context_files(self):
        context = ""
        max_context_length = max(self.context_lengths)

        while self.get_context_length_in_tokens(context) < max_context_length:
            for file in glob.glob(f"{self.haystack_dir}/*.txt"):
                with open(file, 'r') as f:
                    context += f.read()
        return context
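
    # Note: the while loop re-concatenates the whole haystack directory until the
    # running token count reaches the largest requested context length, so at long
    # lengths the haystack will contain repeated copies of the same essays.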

    def get_tokens_from_context(self, context):
        if self.tokenizer_type == "OpenAI":
            return self.enc.encode(context)
        elif self.tokenizer_type == "Anthropic":
            # The Anthropic tokenizer returns an Encoding object; take its ids
            return self.enc.encode(context).ids
        elif self.tokenizer_type == "Huggingface":
            return self.enc(context, truncation=False, return_tensors="pt").input_ids.view(-1).tolist()
        else:
            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")

    def decode_tokens(self, tokens, context_length=None):
        if self.tokenizer_type == "OpenAI":
            return self.enc.decode(tokens[:context_length])
        elif self.tokenizer_type == "Anthropic":
            return self.enc.decode(tokens[:context_length])
        elif self.tokenizer_type == "Huggingface":
            return self.enc.decode(tokens[:context_length], skip_special_tokens=True)
        else:
            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")

    def encode_and_trim(self, context, context_length):
        tokens = self.get_tokens_from_context(context)
        if len(tokens) > context_length:
            context = self.decode_tokens(tokens, context_length)
        return context

    def get_results(self):
        return self.testing_results

    def print_start_test_summary(self):
        print("\n")
        print("Starting Prompt Generation ...")
        print(f"- Tokenizer: {self.tokenizer_type}")
        print(f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
        print(f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
        print(f"- Needle: {self.needle.strip()}")
        print("\n\n")

    def start_test(self):
        if self.print_ongoing_status:
            self.print_start_test_summary()
        asyncio.run(self.run_test())


if __name__ == '__main__':
    with open('utils/needle_test/config-prompt.yaml', 'r') as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default=None)
    args = parser.parse_args()

    ht = Prompter(
        needle=config['prompt']['needle'],
        haystack_dir=config['prompt']['haystack_dir'],
        retrieval_question=config['prompt']['retrieval_question'],
        context_lengths_min=config['context']['min_len'],
        context_lengths_max=config['context']['max_len'],
        context_lengths_num_intervals=config['context']['interval'],
        context_lengths=config['context']['manually_select_list'],
        document_depth_percent_min=config['document_depth']['min_percent'],
        document_depth_percent_max=config['document_depth']['max_percent'],
        document_depth_percent_intervals=config['document_depth']['interval'],
        document_depth_percents=config['document_depth']['manually_select_list'],
        document_depth_percent_interval_type=config['document_depth']['interval_type'],
        tokenizer_type=config['tokenizer']['tokenizer_type'],
        model_name=args.model_name,
        save_dir=config['save_dir'],
    )
    ht.start_test()
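
# Example utils/needle_test/config-prompt.yaml (a sketch inferred from the keys
# read above; the values are illustrative defaults, not the project's shipped config):
#
#   prompt:
#     needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n"
#     haystack_dir: PaulGrahamEssays
#     retrieval_question: What is the best thing to do in San Francisco?
#   context:
#     min_len: 1000
#     max_len: 200000
#     interval: 35
#     manually_select_list: null
#   document_depth:
#     min_percent: 0
#     max_percent: 100
#     interval: 35
#     manually_select_list: null
#     interval_type: linear
#   tokenizer:
#     tokenizer_type: OpenAI
#   save_dir: prompts
#
# Run with, e.g.:
#   python prompt.py --model_name gpt-4-1106-preview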