@@ -14,16 +14,7 @@ import json
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-# Since OctoAI has different naming for llama models, get the huggingface offical model name using OctoAI names.
-def get_model_name(model):
-    if model == "meta-llama-3-70b-instruct":
-        return "meta-llama/Meta-Llama-3-70B-Instruct"
-    elif model == "meta-llama-3-8b-instruct":
-        return "meta-llama/Meta-Llama-3-8B-Instruct"
-    elif model == "llama-2-7b-chat":
-        return "meta-llama/Llama-2-7b-chat-hf"
-    else:
-        return "meta-llama/Llama-2-70b-chat-hf"
+
 def read_text_file(file_path):
     try:
         with open(file_path, 'r') as f:
@@ -88,8 +79,13 @@ def read_file_content(context):
     if len(text) == 0:
         logging.error(f"Error reading files, text is empty")
     return ' '.join(file_strings)
-
-
+# clean the text by removing all parts that do not contain any alphanumeric characters
+def clean(s):
+        result = []
+        for item in s.split('"'):
+            if any(c.isalnum() for c in item):
+                result.append(item)
+        return " ".join(result)
 
 def parse_qa_to_json(response_string):
     split_lines = response_string.split("\n")
@@ -109,21 +105,21 @@ def parse_qa_to_json(response_string):
                 end = i
             # found Question means we have reached the end of the question, so add it to qa_list
             elif '"Question":' in line:
-                question = " ".join(" ".join(split_lines[start:end]).split('"Question":')[1].split('"')[1:-1])
-                answer = " ".join(" ".join(split_lines[end:i]).split('"Answer":')[1].split('"')[1:-1])
+                question = " ".join(split_lines[start:end]).split('"Question":')[1]
+                answer = " ".join(split_lines[end:i]).split('"Answer":')[1]
                 start,end = i,None
-                qa_set.add((question, answer))
+                qa_set.add((clean(question), clean(answer)))
         # adding last question back to qa_list
-        if start and end:
-            question = " ".join(" ".join(split_lines[start:end]).split('"Question":')[1].split('"')[1:-1])
-            answer = " ".join(" ".join(split_lines[end:i]).split('"Answer":')[1].split('"')[1:-1])
-            qa_set.add((question, answer))
+    if start and end:
+        question = " ".join(split_lines[start:end]).split('"Question":')[1]
+        answer = " ".join(split_lines[end:]).split('"Answer":')[1]
+        qa_set.add((clean(question), clean(answer)))
     qa_list = [{"question": q, "answer":a} for q,a in qa_set]
     return json.dumps(qa_list, indent=4)
 
 
-async def prepare_and_send_request(chat_service, api_context: dict, document_content: str, total_questions: int) -> dict:
-    prompt_for_system = api_context['question_prompt_template'].format(total_questions=total_questions, language=api_context["language"])
+async def prepare_and_send_request(chat_service, api_context: dict, document_content: str, num_questions: int) -> dict:
+    prompt_for_system = api_context['question_prompt_template'].format(num_questions=num_questions, language=api_context["language"])
     chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': document_content}]
     result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
     if not result:
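Note (illustration only; the sample text is made up and the loop body is only partially visible in this diff): with the simplified extraction plus clean(), a well-formed model response should parse into the JSON shape the callers expect, e.g.:

    sample_response = '\n'.join([
        '"Question": "What is Llama 3?"',
        '"Answer": "An open large language model from Meta."',
        '"Question": "Where are the weights published?"',
        '"Answer": "On Hugging Face, under meta-llama."',
    ])
    print(parse_qa_to_json(sample_response))
    # expected: a JSON array of {"question": ..., "answer": ...} objects,
    # with quotes and punctuation-only fragments stripped by clean()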
@@ -142,7 +138,8 @@ async def generate_question_batches(chat_service, api_context: dict):
 
     total_questions = api_context["total_questions"]
     batches_count = len(document_batches)
-    base_questions_per_batch = total_questions // batches_count
+    # each batch should have at least 1 question
+    base_questions_per_batch = max(total_questions // batches_count,1)
     extra_questions = total_questions % batches_count
 
     print(f"Questions per batch: {base_questions_per_batch} (+1 for the first {extra_questions} batches), Total questions: {total_questions}, Batches: {batches_count}")
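Note (illustration with made-up numbers): per the print statement above, the first extra_questions batches each take one extra question. One consequence of the new max(..., 1) floor: when total_questions < batches_count, the batches collectively request more questions than asked for, assuming the same +1 rule is applied downstream:

    total_questions, batches_count = 10, 4
    base_questions_per_batch = max(total_questions // batches_count, 1)  # 2
    extra_questions = total_questions % batches_count                    # 2
    allocation = [base_questions_per_batch + (1 if i < extra_questions else 0)
                  for i in range(batches_count)]
    print(allocation)  # [3, 3, 2, 2] -> sums to 10
    # edge case: total_questions=2, batches_count=4 gives [2, 2, 1, 1] -> 6 questions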