@@ -4,21 +4,33 @@
 import os
 import re
 from transformers import AutoTokenizer
-from octoai.client import Client
 import asyncio
 import magic
 from PyPDF2 import PdfReader
 import json
 from doc_processor import split_text_into_chunks
 import logging
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

-
+# OctoAI uses its own names for the Llama models, so map the OctoAI model
+# name back to the official Hugging Face model name.
+def get_model_name(model):
+    if model == "meta-llama-3-70b-instruct":
+        return "meta-llama/Meta-Llama-3-70B-Instruct"
+    elif model == "meta-llama-3-8b-instruct":
+        return "meta-llama/Meta-Llama-3-8B-Instruct"
+    elif model == "llama-2-7b-chat":
+        return "meta-llama/Llama-2-7b-chat-hf"
+    else:
+        # Fall back to Llama 2 70B chat for any unrecognized model name.
+        return "meta-llama/Llama-2-70b-chat-hf"
 def read_text_file(file_path):
     try:
         with open(file_path, 'r') as f:
-            return f.read().strip() + ' '
+            text = f.read().strip() + ' '
+            if len(text) == 0:
+                logging.warning(f"File is empty: {file_path}")
+            return text
     except Exception as e:
         logging.error(f"Error reading text file {file_path}: {e}")
         return ''
@@ -29,6 +41,9 @@ def read_pdf_file(file_path):
         pdf_reader = PdfReader(f)
         num_pages = len(pdf_reader.pages)
         file_text = [pdf_reader.pages[page_num].extract_text().strip() + ' ' for page_num in range(num_pages)]
-        return ''.join(file_text)
+        text = ''.join(file_text)
+        if len(text) == 0:
+            logging.warning(f"File is empty: {file_path}")
+        return text
     except Exception as e:
         logging.error(f"Error reading PDF file {file_path}: {e}")
@@ -41,6 +56,8 @@ def read_json_file(file_path):
         # Assuming each item in the list has a 'question' and an 'answer' key
         # Concatenate the question and answer pairs, separated by spaces, into a single string
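+        # e.g. data = [{"question": "What is X?", "answer": "It is Y."}, ...]
+        # (hypothetical sample; the actual content comes from the input file)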
         file_text = ' '.join([item['question'].strip() + ' ' + item['answer'].strip() + ' ' for item in data])
+        if len(file_text) == 0:
+            logging.warning(f"File is empty: {file_path}")
         return file_text
     except Exception as e:
         logging.error(f"Error reading JSON file {file_path}: {e}")
@@ -48,6 +65,7 @@ def read_json_file(file_path):


 def process_file(file_path):
+    logging.info(f"Starting to process file: {file_path}")
     file_type = magic.from_file(file_path, mime=True)
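+    # Note: python-magic returns MIME types such as 'text/plain' or
+    # 'application/json'; the bare 'JSON' entry below is unlikely to ever match.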
     if file_type in ['text/plain', 'text/markdown', 'JSON']:
         return read_text_file(file_path)
@@ -66,36 +84,56 @@ def read_file_content(context):
         file_text = process_file(file_path)
         if file_text:
             file_strings.append(file_text)
-
-    return ' '.join(file_strings)
+    text = ' '.join(file_strings)
+    if len(text) == 0:
+        logging.error("Error reading files: the combined text is empty")
+    return text


 def parse_qa_to_json(response_string):
-    # Adjusted regex to capture question-answer pairs more flexibly
-    # This pattern accounts for optional numbering and different question/answer lead-ins
-    pattern = re.compile(
-        r"\d*\.\s*Question:\s*(.*?)\nAnswer:\s*(.*?)(?=\n\d*\.\s*Question:|\Z)",
-        re.DOTALL
-    )
-
-    # Find all matches in the response string
-    matches = pattern.findall(response_string)
-
-    # Convert matches to a structured format
-    qa_list = [{"question": match[0].strip(), "answer": match[1].strip()} for match in matches]
-
-    # Convert the list to a JSON string
+    split_lines = response_string.split("\n")
+    start, end = None, None
+    # Use a set to avoid duplicate question/answer pairs produced by concurrent async calls
+    qa_set = set()
+    for i in range(len(split_lines)):
+        line = split_lines[i]
+        # Look for the start of a question
+        if start is None:
+            # Once "Question" is found, record this line number as the start
+            if '"Question":' in line:
+                start = i
+        else:
+            # A question is open; once "Answer" is found, record its line number as the end
+            if '"Answer":' in line:
+                end = i
+            # The next "Question" marks the end of the current pair, so add it to the set
+            elif '"Question":' in line:
+                question = " ".join(" ".join(split_lines[start:end]).split('"Question":')[1].split('"')[1:-1])
+                answer = " ".join(" ".join(split_lines[end:i]).split('"Answer":')[1].split('"')[1:-1])
+                start, end = i, None
+                qa_set.add((question, answer))
+    # Add the final question/answer pair
+    if start is not None and end is not None:
+        question = " ".join(" ".join(split_lines[start:end]).split('"Question":')[1].split('"')[1:-1])
+        answer = " ".join(" ".join(split_lines[end:]).split('"Answer":')[1].split('"')[1:-1])
+        qa_set.add((question, answer))
+    qa_list = [{"question": q, "answer": a} for q, a in qa_set]
     return json.dumps(qa_list, indent=4)
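+# A minimal sketch of the expected behavior (invented sample text; the real
+# input is the model's response to the question-generation prompt):
+#   parse_qa_to_json('"Question": "What is X?"\n"Answer": "It is Y."')
+#   -> the JSON string for [{"question": "What is X?", "answer": "It is Y."}]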


 async def prepare_and_send_request(chat_service, api_context: dict, document_content: str, total_questions: int) -> dict:
     prompt_for_system = api_context['question_prompt_template'].format(total_questions=total_questions, language=api_context["language"])
     chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': document_content}]
-    return json.loads(await chat_service.execute_chat_request_async(api_context, chat_request_payload))
+    # Send the request once and parse the returned JSON string.
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    if not result:
+        return {}
+    return json.loads(result)

 async def generate_question_batches(chat_service, api_context: dict):
     document_text = read_file_content(api_context)
+    if len(document_text) == 0:
+        logging.error("Error reading files: document_text is empty")
     if api_context["model"] in ["meta-llama-3-70b-instruct", "meta-llama-3-8b-instruct"]:
         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", pad_token="</s>", padding_side="right")
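+        # Llama 3 8B and 70B share the same tokenizer, so loading the 8B
+        # tokenizer covers both model sizes.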
     else:
@@ -114,7 +152,11 @@ async def generate_question_batches(chat_service, api_context: dict):
         # Distribute extra questions across the first few batches
         questions_in_current_batch = base_questions_per_batch + (1 if batch_index < extra_questions else 0)
         print(f"Batch {batch_index + 1} - {questions_in_current_batch} questions ********")
-        generation_tasks.append(prepare_and_send_request(chat_service, api_context, batch_content, questions_in_current_batch))
+        try:
+            # Calling the coroutine function here only creates the coroutine;
+            # request errors surface later, when the tasks are awaited below.
+            result = prepare_and_send_request(chat_service, api_context, batch_content, questions_in_current_batch)
+            generation_tasks.append(result)
+        except Exception as e:
+            logging.error(f"Error during chat request execution: {e}")

     question_generation_results = await asyncio.gather(*generation_tasks)
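+    # With the default return_exceptions=False, the first failed batch raises
+    # out of asyncio.gather; pass return_exceptions=True to collect per-batch
+    # errors instead of failing the whole run.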