@@ -0,0 +1,66 @@
+import tiktoken
+
+# Estimated number of tokens to reserve for each generated answer.
+AVERAGE_TOKENS_PER_RESULT = 100
+
+def get_token_limit_for_model(model: str) -> int:
+    """Returns the token limit for a given model."""
+    if model == "gpt-3.5-turbo-16k":
+        return 16384
+    raise ValueError(f"Unsupported model: {model!r}")
+
+def fetch_encoding_for_model(model: str = "gpt-3.5-turbo-16k") -> tiktoken.Encoding:
+    """Fetches the tiktoken encoding for the specified model."""
+    try:
+        return tiktoken.encoding_for_model(model)
+    except KeyError:
+        print("Warning: Model not found. Using 'cl100k_base' encoding as default.")
+        return tiktoken.get_encoding("cl100k_base")
+
+def calculate_num_tokens_for_message(message: str, model: str = "gpt-3.5-turbo-16k") -> int:
+    """Calculates the number of tokens used by a message."""
+    encoding = fetch_encoding_for_model(model)
+    # Add 3 tokens to account for the priming of the assistant's reply.
+    return len(encoding.encode(message)) + 3
+
+def split_text_into_tokenized_chunks(context: dict, text_to_split: str) -> list[str]:
+    """Splits a long string into substrings based on token length constraints."""
+    # Budget: the model's limit minus the question prompt, minus room for answers.
+    max_tokens_per_chunk = (
+        get_token_limit_for_model(context["model"]) -
+        calculate_num_tokens_for_message(context["question_prompt_template"]) -
+        AVERAGE_TOKENS_PER_RESULT * context["total_questions"]
+    )
+    if max_tokens_per_chunk <= 0:
+        raise ValueError("Prompt and result budget exceed the model's token limit.")
+
+    substrings = []
+    chunk_tokens = []
+
+    encoding = fetch_encoding_for_model(context["model"])
+    text_tokens = encoding.encode(text_to_split)
+
+    for token in text_tokens:
+        # Start a new chunk once adding this token would exceed the budget.
+        if len(chunk_tokens) + 1 > max_tokens_per_chunk:
+            substrings.append(encoding.decode(chunk_tokens).strip())
+            chunk_tokens = [token]
+        else:
+            chunk_tokens.append(token)
+
+    # Flush the final partial chunk, if any.
+    if chunk_tokens:
+        substrings.append(encoding.decode(chunk_tokens).strip())
+
+    return substrings
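+
+if __name__ == "__main__":
+    # Minimal usage sketch. The context values below are illustrative
+    # assumptions, not taken from the surrounding application.
+    demo_context = {
+        "model": "gpt-3.5-turbo-16k",
+        "question_prompt_template": "Answer the following questions about the text:",
+        "total_questions": 5,
+    }
+    chunks = split_text_into_tokenized_chunks(demo_context, "lorem ipsum " * 4000)
+    print(f"Split text into {len(chunks)} chunk(s)")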