# ppt_parser.py
#
# Note: images are not handled yet; only text frames and tables are extracted.
import os

from pptx import Presentation
  4. class PPTParser:
  5. def __init__(self):
  6. pass
  7. def parse(self, file_path):
  8. if not os.path.exists(file_path):
  9. raise FileNotFoundError(f"PowerPoint file not found: {file_path}")
  10. if not file_path.lower().endswith(('.pptx')):
  11. raise ValueError("Only .pptx format is supported")
  12. ppt = Presentation(file_path)
  13. all_text = []
  14. all_text.append(f"PowerPoint Presentation: {os.path.basename(file_path)}")
  15. all_text.append(f"Total Slides: {len(ppt.slides)}")
  16. all_text.append("")
  17. for i, slide in enumerate(ppt.slides):
  18. slide_num = i + 1
  19. all_text.append(f"Slide {slide_num}")
  20. all_text.append("-" * 40)
  21. if slide.shapes.title:
  22. all_text.append(f"Title: {slide.shapes.title.text}")
  23. slide_text = self._extract_slide_text(slide)
  24. if slide_text:
  25. all_text.append(slide_text)
  26. all_text.append("")
  27. return "\n".join(all_text)
  28. def _extract_slide_text(self, slide):
  29. texts = []
  30. for shape in slide.shapes:
  31. if shape.has_text_frame:
  32. text = self._extract_text_frame(shape.text_frame)
  33. if text:
  34. texts.append(text)
  35. # table logic
  36. if shape.has_table:
  37. table_text = self._extract_table(shape.table)
  38. if table_text:
  39. texts.append(table_text)
  40. return "\n".join(texts)
  41. def _extract_text_frame(self, text_frame):
  42. text_lines = []
  43. for paragraph in text_frame.paragraphs:
  44. if paragraph.text.strip():
  45. text_lines.append(paragraph.text.strip())
  46. return "\n".join(text_lines)
  47. def _extract_table(self, table):
  48. table_lines = []
  49. for row in table.rows:
  50. row_text = []
  51. for cell in row.cells:
  52. cell_text = ""
  53. for paragraph in cell.text_frame.paragraphs:
  54. if paragraph.text.strip():
  55. cell_text += paragraph.text.strip() + " "
  56. row_text.append(cell_text.strip())
  57. if any(row_text): # Skip empty rows
  58. table_lines.append(" | ".join(row_text))
  59. return "\n".join(table_lines)
  60. def save(self, content, output_path):
  61. os.makedirs(os.path.dirname(output_path), exist_ok=True)
  62. with open(output_path, 'w', encoding='utf-8') as f:
  63. f.write(content)