|  | @@ -1,49 +0,0 @@
 | 
	
		
			
				|  |  | -import os
 | 
	
		
			
				|  |  | -import magic
 | 
	
		
			
				|  |  | -from PyPDF2 import PdfReader
 | 
	
		
			
				|  |  | -import logging
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -# Initialize logging
 | 
	
		
			
				|  |  | -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -def read_text_file(file_path):
 | 
	
		
			
				|  |  | -    try:
 | 
	
		
			
				|  |  | -        with open(file_path, 'r') as f:
 | 
	
		
			
				|  |  | -            return f.read().strip() + ' '
 | 
	
		
			
				|  |  | -    except Exception as e:
 | 
	
		
			
				|  |  | -        logging.error(f"Error reading text file {file_path}: {e}")
 | 
	
		
			
				|  |  | -    return ''
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -def read_pdf_file(file_path):
 | 
	
		
			
				|  |  | -    try:
 | 
	
		
			
				|  |  | -        with open(file_path, 'rb') as f:
 | 
	
		
			
				|  |  | -            pdf_reader = PdfReader(f)
 | 
	
		
			
				|  |  | -            num_pages = len(pdf_reader.pages)
 | 
	
		
			
				|  |  | -            file_text = [pdf_reader.pages[page_num].extract_text().strip() + ' ' for page_num in range(num_pages)]
 | 
	
		
			
				|  |  | -            return ''.join(file_text)
 | 
	
		
			
				|  |  | -    except Exception as e:
 | 
	
		
			
				|  |  | -        logging.error(f"Error reading PDF file {file_path}: {e}")
 | 
	
		
			
				|  |  | -    return ''
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -def process_file(file_path):
 | 
	
		
			
				|  |  | -    file_type = magic.from_file(file_path, mime=True)
 | 
	
		
			
				|  |  | -    if file_type in ['text/plain', 'text/markdown']:
 | 
	
		
			
				|  |  | -        return read_text_file(file_path)
 | 
	
		
			
				|  |  | -    elif file_type == 'application/pdf':
 | 
	
		
			
				|  |  | -        return read_pdf_file(file_path)
 | 
	
		
			
				|  |  | -    else:
 | 
	
		
			
				|  |  | -        logging.warning(f"Unsupported file type {file_type} for file {file_path}")
 | 
	
		
			
				|  |  | -        return ''
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -def get_file_string(context):
 | 
	
		
			
				|  |  | -    file_strings = []
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    for root, _, files in os.walk(context['data_dir']):
 | 
	
		
			
				|  |  | -        for file in files:
 | 
	
		
			
				|  |  | -            file_path = os.path.join(root, file)
 | 
	
		
			
				|  |  | -            file_text = process_file(file_path)
 | 
	
		
			
				|  |  | -            if file_text:
 | 
	
		
			
				|  |  | -                file_strings.append(file_text)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    return ' '.join(file_strings)
 | 
	
		
			
				|  |  | -
 |