Browse Source

Create docx_parser.py

Sanyam Bhutani 1 month ago
parent
commit
2796834d38
1 changed file with 57 additions and 0 deletions
  1. 57 0
      end-to-end-use-cases/data-tool/src/parsers/docx_parser.py

+ 57 - 0
end-to-end-use-cases/data-tool/src/parsers/docx_parser.py

@@ -0,0 +1,57 @@
+import os
+import docx
+
class DOCXParser:
    """Extract plain text from Word (.docx) files using python-docx.

    Each non-empty paragraph becomes one output line.  Each table row becomes
    one ``' | '``-separated line, and every non-empty table is followed by a
    blank line.
    """

    def __init__(self):
        """Create a parser; it keeps no configuration or state."""

    def parse(self, file_path):
        """Return the text content of the DOCX file at *file_path*.

        Paragraphs and tables are emitted in the order they appear in the
        document when the installed python-docx supports it (see
        ``_iter_blocks``); otherwise tables follow all paragraphs.

        Args:
            file_path: Path of the ``.docx`` file to read.

        Returns:
            str: Non-empty paragraphs and rendered tables joined by newlines.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"DOCX file not found: {file_path}")
        doc = docx.Document(file_path)
        return self._render_blocks(self._iter_blocks(doc))

    def _iter_blocks(self, doc):
        """Yield the paragraphs and tables of *doc* in reading order.

        ``Document.iter_inner_content()`` (python-docx >= 1.0) walks the body
        in true document order.  Older releases only expose the separate
        ``paragraphs`` and ``tables`` collections, which carry no relative
        position, so in that case tables are yielded after all paragraphs.
        """
        iter_inner_content = getattr(doc, "iter_inner_content", None)
        if callable(iter_inner_content):
            yield from iter_inner_content()
            return
        yield from doc.paragraphs
        yield from doc.tables

    def _render_blocks(self, blocks):
        """Join an iterable of paragraph/table objects into a single string.

        Objects exposing ``rows`` are treated as tables; everything else is
        treated as a paragraph and skipped when its text is blank.
        """
        lines = []
        for block in blocks:
            if hasattr(block, "rows"):
                lines.extend(self._process_table(block))
            elif block.text.strip():
                lines.append(block.text)
        return "\n".join(lines)

    def _process_table(self, table):
        """Render *table* as a list of text lines.

        Args:
            table: Object with ``rows``; each row has ``cells`` and each cell
                has ``text`` (a python-docx ``Table`` satisfies this).

        Returns:
            list: One ``' | '``-joined line per row that has any non-blank
            cell, followed by a single ``''`` separator entry.  An empty list
            if the table produced no lines.
        """
        table_lines = []
        for row in table.rows:
            row_texts = []
            previous_cell = None
            for cell in row.cells:
                # python-docx returns the same cell object once per grid
                # column for a horizontally merged cell; emit its text once.
                if cell is previous_cell:
                    continue
                previous_cell = cell
                cell_text = cell.text.strip()
                if cell_text:
                    row_texts.append(cell_text)
            if row_texts:
                table_lines.append(" | ".join(row_texts))
        if table_lines:
            # Blank entry so the table is followed by an empty line.
            table_lines.append("")
        return table_lines

    def save(self, content, output_path):
        """Write *content* to *output_path* as UTF-8 text.

        Missing parent directories are created.  A bare file name (no
        directory component) is written to the current working directory.

        Args:
            content: Text to write.
            output_path: Destination file path.
        """
        directory = os.path.dirname(output_path)
        # os.makedirs('') raises FileNotFoundError, so only create a parent
        # directory when the path actually has one.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)