1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
# NOTE: Images are not handled yet; only text frames and tables are extracted.
- import os
- from pptx import Presentation
class PPTParser:
    """Extract the textual content of a ``.pptx`` presentation as plain text.

    Only shapes that report a text frame or a table are read; any other
    shape content (pictures, for example) is ignored.
    """

    def __init__(self):
        """Create a parser. No configuration or state is kept."""

    def parse(self, file_path):
        """Return the text of every slide in *file_path* as a single string.

        The result starts with a short header (file name and slide count)
        followed by one section per slide: a ``Slide N`` heading, a dashed
        rule, the slide title when the slide has one, and the text gathered
        by :meth:`_extract_slide_text`.

        Args:
            file_path: Path to an existing ``.pptx`` file, as a string or
                any ``os.PathLike`` object.

        Returns:
            The extracted text, with lines joined by newlines.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
            ValueError: If *file_path* does not end in ``.pptx``
                (compared case-insensitively).
        """
        # Normalize once so pathlib.Path arguments work with the string
        # checks below.
        path = os.fspath(file_path)

        if not os.path.exists(path):
            raise FileNotFoundError(f"PowerPoint file not found: {path}")
        if not path.lower().endswith(".pptx"):
            raise ValueError("Only .pptx format is supported")

        ppt = Presentation(path)
        return self._format_presentation(ppt, os.path.basename(path))

    def _format_presentation(self, ppt, file_name):
        """Render a loaded presentation as the plain-text report.

        Args:
            ppt: Presentation object exposing a sized, iterable ``slides``.
            file_name: Name shown in the report header.

        Returns:
            The report text, with lines joined by newlines.
        """
        slides = ppt.slides
        lines = [
            f"PowerPoint Presentation: {file_name}",
            f"Total Slides: {len(slides)}",
            "",
        ]

        for slide_num, slide in enumerate(slides, start=1):
            lines.append(f"Slide {slide_num}")
            lines.append("-" * 40)

            # NOTE(review): the title shape is presumably also yielded when
            # iterating slide.shapes, in which case its text shows up again
            # in the body text below -- confirm whether that is wanted.
            title = slide.shapes.title
            if title is not None:
                lines.append(f"Title: {title.text}")

            slide_text = self._extract_slide_text(slide)
            if slide_text:
                lines.append(slide_text)

            # Blank separator line after every slide section.
            lines.append("")

        return "\n".join(lines)

    def _extract_slide_text(self, slide):
        """Collect the text of all text frames and tables on *slide*.

        Args:
            slide: Slide object whose ``shapes`` are iterated in order.

        Returns:
            The non-empty text pieces joined by newlines, or ``""`` when the
            slide has no extractable text.
        """
        pieces = []

        for shape in slide.shapes:
            if shape.has_text_frame:
                frame_text = self._extract_text_frame(shape.text_frame)
                if frame_text:
                    pieces.append(frame_text)

            if shape.has_table:
                table_text = self._extract_table(shape.table)
                if table_text:
                    pieces.append(table_text)

        return "\n".join(pieces)

    def _extract_text_frame(self, text_frame):
        """Return the non-blank paragraphs of *text_frame*, one per line.

        Each paragraph is stripped of surrounding whitespace; paragraphs
        that are empty after stripping are dropped.
        """
        stripped = (paragraph.text.strip() for paragraph in text_frame.paragraphs)
        return "\n".join(line for line in stripped if line)

    def _extract_table(self, table):
        """Return *table* as text, one line per row with `` | `` between cells.

        Paragraphs inside a cell are stripped and joined with single spaces.
        Rows in which every cell is empty are skipped.
        """
        table_lines = []

        for row in table.rows:
            row_text = []
            for cell in row.cells:
                stripped = (p.text.strip() for p in cell.text_frame.paragraphs)
                row_text.append(" ".join(part for part in stripped if part))

            if any(row_text):  # Skip rows with no text in any cell.
                table_lines.append(" | ".join(row_text))

        return "\n".join(table_lines)

    def save(self, content, output_path):
        """Write *content* to *output_path* as UTF-8 text.

        Missing parent directories are created first. An existing file at
        *output_path* is overwritten.

        Args:
            content: Text to write.
            output_path: Destination file path.
        """
        # dirname() is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        directory = os.path.dirname(output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)
|