2 года назад · 2f884a497b
--- a/tutorials/chatbot/data_pipelines/generator_utils.py
+++ b/tutorials/chatbot/data_pipelines/generator_utils.py
@@ -8,15 +8,14 @@ import magic
 
				 from PyPDF2 import PdfReader
			
 
				 from functools import partial
			
 
				 import json
			
 
				-from token_processor import split_text_into_tokenized_chunks
			
 
				-# from file_handler import read_file_content
			
 
				+from doc_processor import split_text_into_chunks
			
 
				 import logging
			
 
				 
			
 
				 # Initialize logging
			
 
				 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
			
 
				 
			
 
				 # Manage rate limits with throttling
			
 
				-rate_limit_threshold = 100
			
 
				+rate_limit_threshold = 2000
			
 
				 allowed_concurrent_requests = int(rate_limit_threshold * 0.75)
			
 
				 request_limiter = asyncio.Semaphore(allowed_concurrent_requests)
			
 
				 
			
@@ -87,25 +86,26 @@ async def prepare_and_send_request(api_context: dict, document_content: str, tot
 
				     return json.loads(await execute_chat_request_async(api_context, chat_request_payload))
			
 
				 
			
 
				 async def generate_question_batches(api_context: dict):
			
 
				-    
			
 
				     document_text = read_file_content(api_context)
			
 
				-    print("completed step 1")
			
 
				-    document_batches = split_text_into_tokenized_chunks(api_context, document_text)
			
 
				-    print("completed step 2")
			
 
				-
			
 
				-    questions_per_batch = api_context["total_questions"] // len(document_batches)
			
 
				-    print("completed step 3")
			
 
				+    document_batches = split_text_into_chunks(api_context, document_text)
			
 
				+    
			
 
				+    total_questions = api_context["total_questions"]
			
 
				+    batches_count = len(document_batches)
			
 
				+    base_questions_per_batch = total_questions // batches_count
			
 
				+    extra_questions = total_questions % batches_count
			
 
				 
			
 
				+    print(f"Questions per batch: {base_questions_per_batch} (+1 for the first {extra_questions} batches), Total questions: {total_questions}, Batches: {batches_count}")
			
 
				+    
			
 
				     generation_tasks = []
			
 
				     for batch_index, batch_content in enumerate(document_batches):
			
 
				-        questions_in_current_batch = questions_per_batch + 1 if batch_index == len(document_batches) - 1 and len(document_batches) * questions_per_batch < api_context["total_questions"] else questions_per_batch
			
 
				+        # Distribute extra questions across the first few batches
			
 
				+        questions_in_current_batch = base_questions_per_batch + (1 if batch_index < extra_questions else 0)
			
 
				+        print(f"Batch {batch_index + 1} - {questions_in_current_batch} questions ********")
			
 
				         generation_tasks.append(prepare_and_send_request(api_context, batch_content, questions_in_current_batch))
			
 
				-    print("completed step 4")
			
 
				-
			
 
				 
			
 
				     question_generation_results = await asyncio.gather(*generation_tasks)
			
 
				-    print("completed step 5")
			
 
				 
			
 
				     return question_generation_results
			
 
				 
			
 
				 
			
 
				+