transcript_generator.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. """Transcript generation processor for PPTX slides."""
  2. from pathlib import Path
  3. from typing import Optional, Union
  4. import pandas as pd
  5. from tqdm import tqdm
  6. from ..config.settings import get_processing_config
  7. from ..core.llama_client import LlamaClient
  8. class TranscriptProcessor:
  9. """Processor for generating transcripts from slide images and notes."""
  10. def __init__(self, api_key: Optional[str] = None):
  11. """
  12. Initialize transcript processor.
  13. Args:
  14. api_key: Llama API key. If None, will be loaded from config/environment.
  15. """
  16. self.client = LlamaClient(api_key=api_key)
  17. self.processing_config = get_processing_config()
  18. def process_single_slide(
  19. self,
  20. image_path: Union[str, Path],
  21. speaker_notes: str = "",
  22. system_prompt: Optional[str] = None,
  23. ) -> str:
  24. """
  25. Process a single slide to generate transcript.
  26. Args:
  27. image_path: Path to the slide image
  28. speaker_notes: Speaker notes for the slide
  29. system_prompt: Custom system prompt. If None, uses default from config.
  30. Returns:
  31. Generated transcript text
  32. """
  33. return self.client.generate_transcript(
  34. image_path=str(image_path),
  35. speaker_notes=speaker_notes,
  36. system_prompt=system_prompt,
  37. stream=False,
  38. )
  39. def process_slides_dataframe(
  40. self,
  41. df: pd.DataFrame,
  42. output_dir: Union[str, Path],
  43. system_prompt: Optional[str] = None,
  44. ) -> pd.DataFrame:
  45. """
  46. Process slides from a DataFrame containing slide information.
  47. Args:
  48. df: DataFrame with slide information (from extract_pptx_notes)
  49. output_dir: Directory containing slide images
  50. system_prompt: Custom system prompt. If None, uses default from config.
  51. Returns:
  52. DataFrame with added 'ai_transcript' column
  53. """
  54. output_dir = Path(output_dir)
  55. df_copy = df.copy()
  56. for i in tqdm(range(len(df_copy)), desc="Processing slides"):
  57. # Get data for current slide
  58. slide_filename = df_copy.iloc[i]["image_filename"]
  59. speaker_notes = (
  60. df_copy.iloc[i]["speaker_notes"]
  61. if pd.notna(df_copy.iloc[i]["speaker_notes"])
  62. else ""
  63. )
  64. image_path = output_dir / slide_filename
  65. # Generate transcript
  66. transcript = self.process_single_slide(
  67. image_path=image_path,
  68. speaker_notes=speaker_notes,
  69. system_prompt=system_prompt,
  70. )
  71. # Add to dataframe
  72. df_copy.loc[i, "ai_transcript"] = transcript
  73. return df_copy
  74. def process_slides(
  75. df: pd.DataFrame,
  76. output_dir: Union[str, Path] = "slide_images",
  77. api_key: Optional[str] = None,
  78. system_prompt: Optional[str] = None,
  79. ) -> pd.DataFrame:
  80. """
  81. Legacy function for backward compatibility with notebook code.
  82. Process slides from a DataFrame to generate transcripts.
  83. Args:
  84. df: DataFrame with slide information (from extract_pptx_notes)
  85. output_dir: Directory containing slide images
  86. api_key: Llama API key. If None, will be loaded from config/environment.
  87. system_prompt: Custom system prompt. If None, uses default from config.
  88. Returns:
  89. DataFrame with added 'ai_transcript' column
  90. """
  91. processor = TranscriptProcessor(api_key=api_key)
  92. return processor.process_slides_dataframe(df, output_dir, system_prompt)