2 年前 · a34793f20e
--- a/tutorials/chatbot/data_pipelines/file_handler.py
+++ b/tutorials/chatbot/data_pipelines/file_handler.py
@@ -1,49 +0,0 @@
 
				-import os
			
 
				-import magic
			
 
				-from PyPDF2 import PdfReader
			
 
				-import logging
			
 
				-
			
 
				-# Initialize logging
			
 
				-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
			
 
				-
			
 
				-def read_text_file(file_path):
			
 
				-    try:
			
 
				-        with open(file_path, 'r') as f:
			
 
				-            return f.read().strip() + ' '
			
 
				-    except Exception as e:
			
 
				-        logging.error(f"Error reading text file {file_path}: {e}")
			
 
				-    return ''
			
 
				-
			
 
				-def read_pdf_file(file_path):
			
 
				-    try:
			
 
				-        with open(file_path, 'rb') as f:
			
 
				-            pdf_reader = PdfReader(f)
			
 
				-            num_pages = len(pdf_reader.pages)
			
 
				-            file_text = [pdf_reader.pages[page_num].extract_text().strip() + ' ' for page_num in range(num_pages)]
			
 
				-            return ''.join(file_text)
			
 
				-    except Exception as e:
			
 
				-        logging.error(f"Error reading PDF file {file_path}: {e}")
			
 
				-    return ''
			
 
				-
			
 
				-def process_file(file_path):
			
 
				-    file_type = magic.from_file(file_path, mime=True)
			
 
				-    if file_type in ['text/plain', 'text/markdown']:
			
 
				-        return read_text_file(file_path)
			
 
				-    elif file_type == 'application/pdf':
			
 
				-        return read_pdf_file(file_path)
			
 
				-    else:
			
 
				-        logging.warning(f"Unsupported file type {file_type} for file {file_path}")
			
 
				-        return ''
			
 
				-
			
 
				-def get_file_string(context):
			
 
				-    file_strings = []
			
 
				-
			
 
				-    for root, _, files in os.walk(context['data_dir']):
			
 
				-        for file in files:
			
 
				-            file_path = os.path.join(root, file)
			
 
				-            file_text = process_file(file_path)
			
 
				-            if file_text:
			
 
				-                file_strings.append(file_text)
			
 
				-
			
 
				-    return ' '.join(file_strings)
			
 
				-