doc_processor.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
# Estimated number of tokens per generated result. split_text_into_chunks
# multiplies this by context["total_questions"] to reserve part of the model's
# token budget before sizing text chunks.
AVERAGE_TOKENS_PER_RESULT = 100
  5. def get_token_limit_for_model(model: str) -> int:
  6. """Returns the token limit for a given model."""
  7. if model == "llama-2-70b-chat-fp16" or model == "llama-2-13b-chat-turbo":
  8. return 4096
  9. def calculate_num_tokens_for_message(encoded_text) -> int:
  10. """Calculates the number of tokens used by a message."""
  11. # Added 3 to account for priming with assistant's reply, as per original comment
  12. return len(encoded_text) + 3
  13. def split_text_into_chunks(context: dict, text: str, tokenizer) -> list[str]:
  14. """Splits a long text into substrings based on token length constraints, adjusted for question generation."""
  15. # Adjusted approach to calculate max tokens available for text chunks
  16. encoded_text = tokenizer(text, return_tensors="pt", padding=True)["input_ids"]
  17. encoded_text = encoded_text.squeeze()
  18. model_token_limit = get_token_limit_for_model(context["model"])
  19. tokens_for_questions = calculate_num_tokens_for_message(encoded_text)
  20. estimated_tokens_per_question = AVERAGE_TOKENS_PER_RESULT
  21. estimated_total_question_tokens = estimated_tokens_per_question * context["total_questions"]
  22. # Ensure there's a reasonable minimum chunk size
  23. max_tokens_for_text = max(model_token_limit - tokens_for_questions - estimated_total_question_tokens, model_token_limit // 10)
  24. chunks, current_chunk = [], []
  25. print(f"Splitting text into chunks of {max_tokens_for_text} tokens, encoded_text {len(encoded_text)}", flush=True)
  26. for token in encoded_text:
  27. if len(current_chunk) >= max_tokens_for_text:
  28. chunks.append(tokenizer.decode(current_chunk).strip())
  29. current_chunk = []
  30. else:
  31. current_chunk.append(token)
  32. if current_chunk:
  33. chunks.append(tokenizer.decode(current_chunk).strip())
  34. print(f"Number of chunks in the processed text: {len(chunks)}", flush=True)
  35. return chunks