config.yaml

# PPTX to Transcript Configuration

# API Configuration
api:
  groq_model: "meta-llama/llama-4-maverick-17b-128e-instruct"
  max_retries: 3
  retry_delay: 1
  rate_limit_delay: 1
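
# A minimal sketch (assumption, not part of this config or the tool's shipped code) of
# how a loader script might consume the `api` section above with the Groq Python client.
# The file name, message content, and the simple linear-backoff loop are illustrative.
#
#   import time
#   import yaml
#   from groq import Groq
#
#   cfg = yaml.safe_load(open("config.yaml"))["api"]
#   client = Groq()  # expects GROQ_API_KEY in the environment
#   for attempt in range(cfg["max_retries"]):
#       try:
#           resp = client.chat.completions.create(
#               model=cfg["groq_model"],
#               messages=[{"role": "user", "content": "ping"}],
#           )
#           break
#       except Exception:
#           time.sleep(cfg["retry_delay"] * (attempt + 1))  # back off before retrying
#   time.sleep(cfg["rate_limit_delay"])  # pause between requests to stay under rate limits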

# Processing Configuration
processing:
  default_dpi: 200
  supported_formats: ["png", "jpeg", "jpg"]
  default_format: "png"
  batch_size: 5

# File Paths
paths:
  default_output_dir: "output/"
  cache_dir: "cache"
  logs_dir: "logs"
  temp_dir: "temp"

# Current Project Settings
current_project:
  pptx_file: "input/All About Llamas"  # PowerPoint file name (without the .ppt/.pptx extension)
  extension: ".pptx"
  output_dir: "output/"

# LibreOffice Paths (auto-detected, but can be overridden)
libreoffice:
  possible_paths:
    - "/Applications/LibreOffice.app/Contents/MacOS/soffice"
    - "/usr/bin/soffice"
    - "/usr/local/bin/soffice"
    # # Linux paths
    # - "/snap/bin/libreoffice"
    # - "/opt/libreoffice/program/soffice"
    # # Windows paths
    # - "C:\\Program Files\\LibreOffice\\program\\soffice.exe"
    # - "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe"
    # - "C:\\Users\\%USERNAME%\\AppData\\Local\\Programs\\LibreOffice\\program\\soffice.exe"
    # - "C:\\PortableApps\\LibreOfficePortable\\App\\libreoffice\\program\\soffice.exe"

# Logging Configuration
logging:
  level: "INFO"
  format: "%(asctime)s - %(levelname)s - %(message)s"
  file_enabled: true
  console_enabled: true
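
# A small sketch (assumption) of wiring this section into Python's standard logging
# module; the "run.log" file name under `logs_dir` is hypothetical.
#
#   import logging
#   import os
#   import yaml
#
#   cfg = yaml.safe_load(open("config.yaml"))
#   log_cfg = cfg["logging"]
#   handlers = []
#   if log_cfg["console_enabled"]:
#       handlers.append(logging.StreamHandler())
#   if log_cfg["file_enabled"]:
#       os.makedirs(cfg["paths"]["logs_dir"], exist_ok=True)
#       handlers.append(logging.FileHandler(os.path.join(cfg["paths"]["logs_dir"], "run.log")))
#   logging.basicConfig(level=log_cfg["level"], format=log_cfg["format"], handlers=handlers)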

# Progress Tracking
progress:
  save_interval: 5  # Save progress every N slides
  progress_file: "progress.json"
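
# Illustrative only (the real checkpoint format isn't shown in this file): progress
# could be written as JSON every `save_interval` slides, e.g.
#
#   import json
#
#   def save_progress(cfg: dict, slide_index: int) -> None:
#       if slide_index % cfg["progress"]["save_interval"] == 0:
#           with open(cfg["progress"]["progress_file"], "w") as f:
#               json.dump({"last_completed_slide": slide_index}, f)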

# Image Quality Settings
image_quality:
  jpeg_quality: 90
  jpeg_optimize: true
  png_compression: 6
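
# A hedged example of how these settings map onto Pillow's save() parameters; the
# helper function and file-name stem are illustrative.
#
#   from PIL import Image
#
#   def save_slide(page: Image.Image, stem: str, q: dict) -> None:
#       page.save(stem + ".jpg", quality=q["jpeg_quality"], optimize=q["jpeg_optimize"])
#       page.save(stem + ".png", compress_level=q["png_compression"])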

# Knowledge Base Configuration
knowledge:
  # Enable/disable knowledge base integration
  enabled: true  # Set to true to enable knowledge base features

  # Knowledge base directory path (relative to project root)
  knowledge_base_dir: "knowledge_base"

  # Vector store configuration (FAISS)
  vector_store:
    type: "faiss"             # Vector database type
    index_type: "flat"        # "flat", "ivf", "hnsw"
    use_gpu: false            # Enable GPU acceleration (requires faiss-gpu)
    cache_enabled: true       # Enable persistent caching
    rebuild_on_changes: true  # Auto-rebuild when files change

  # Embedding model configuration
  embedding:
    model_name: "all-MiniLM-L6-v2"  # Lightweight, fast model
    device: "cpu"                   # Use "cuda" if GPU available
    batch_size: 32
    max_seq_length: 512

  # Search configuration
  search:
    top_k: 5                       # Number of knowledge chunks to retrieve
    similarity_threshold: 0.3      # Minimum similarity threshold (0.0 to 1.0)
    enable_keyword_fallback: true  # Enable fallback keyword search if similarity search fails
    max_chunk_size: 1000           # Maximum characters per knowledge chunk
    chunk_overlap: 200             # Overlap between chunks (characters)
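
  # A sketch (not the tool's actual implementation, which isn't shown here) of how the
  # embedding and search settings above could be realized with sentence-transformers and
  # a flat FAISS index over normalized vectors (inner product == cosine similarity).
  # The chunk list and function names are assumptions.
  #
  #   import faiss
  #   import numpy as np
  #   from sentence_transformers import SentenceTransformer
  #
  #   model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
  #   model.max_seq_length = 512
  #
  #   def build_index(chunks: list[str]) -> faiss.IndexFlatIP:
  #       emb = model.encode(chunks, batch_size=32, normalize_embeddings=True)
  #       index = faiss.IndexFlatIP(emb.shape[1])
  #       index.add(np.asarray(emb, dtype="float32"))
  #       return index
  #
  #   def search(index: faiss.IndexFlatIP, query: str, top_k: int = 5, threshold: float = 0.3):
  #       q = model.encode([query], normalize_embeddings=True)
  #       scores, ids = index.search(np.asarray(q, dtype="float32"), top_k)
  #       return [(int(i), float(s)) for i, s in zip(ids[0], scores[0]) if s >= threshold]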

  # Context integration settings
  context:
    # Strategy for combining knowledge with narrative context
    strategy: "combined"                  # Options: "knowledge_only", "narrative_priority", "combined"
    max_context_length: 8000              # Maximum total context length (characters)
    knowledge_weight: 0.3                 # Knowledge context weight (0.0 to 1.0; higher = more knowledge influence)
    integration_method: "system_prompt"   # Integration method: "system_prompt" or "user_message"
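
  # The integration logic itself isn't shown in this file; as a loose, assumed sketch,
  # a "combined" strategy might budget characters by `knowledge_weight` and trim the
  # result to `max_context_length`:
  #
  #   def combine_context(narrative: str, knowledge: str, ctx: dict) -> str:
  #       budget = ctx["max_context_length"]
  #       k_budget = int(budget * ctx["knowledge_weight"])
  #       combined = narrative[: budget - k_budget] + "\n\n" + knowledge[:k_budget]
  #       return combined[:budget]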

  # Performance and reliability settings
  performance:
    # Enable caching of embeddings and search results
    enable_caching: true
    # Cache directory (relative to project root)
    cache_dir: "cache/knowledge"
    # Cache expiration time in hours (0 = never expire)
    cache_expiry_hours: 24
    # Maximum memory usage for embeddings (MB)
    max_memory_mb: 512
    # Enable lazy loading of embeddings
    lazy_loading: true

  # Fallback options for reliability
  fallback:
    # Continue processing if knowledge base fails to load
    graceful_degradation: true
    # Use simple keyword matching if embedding model fails
    use_keyword_fallback: true
    # Log errors but don't fail the entire process
    log_errors_only: true
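
  # A hedged illustration (assumed, not the actual implementation) of what
  # `graceful_degradation` and `log_errors_only` imply for calling code;
  # `load_chunks` and `build_index` are hypothetical helpers.
  #
  #   import logging
  #
  #   def load_knowledge_base(cfg: dict):
  #       try:
  #           return build_index(load_chunks(cfg["knowledge"]["knowledge_base_dir"]))
  #       except Exception as exc:
  #           if cfg["knowledge"]["fallback"]["graceful_degradation"]:
  #               logging.error("Knowledge base unavailable, continuing without it: %s", exc)
  #               return None  # downstream code treats None as "no knowledge context"
  #           raise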

# Example System Prompt - Replace with your own, although this one is pretty good.
system_prompt: |
  You are a speech-aware GenAI expert who specializes in generating natural-sounding transcripts for human narration and text-to-speech systems.
  You are also a GenAI expert specializing in LLaMA vision and language models at Meta.

  Your task is to analyze a PowerPoint slide image and its associated speaker notes and generate a complete, professional transcript suitable for voiceover narration. The voiceover will be recorded for an internal team, so clarity, tone, and spoken correctness are critical.
  Your goal is to ensure that all technical terms, numbers, and abbreviations are rendered the way a human would say them out loud — clearly, naturally, and without confusion.

  Please follow these detailed steps:

  1. Extract all visual content from the slide image:
     - Detect and extract all visible text elements, including titles, headings, body text, callouts, labels, and captions.
     - Preserve the top-to-bottom, left-to-right visual order to reflect how a human would naturally read the slide.
     - Identify any diagrams, tables, or charts and include a brief verbal explanation only if it is necessary to communicate the slide's key message.
     - Do not extract hyperlinks.

  2. Combine the extracted text with the provided speaker notes to form a unified understanding of the slide's purpose and content.

  3. Generate a professional voiceover transcript that:
     - Sounds natural, confident, and informative, as if explaining the slide to an internal executive audience.
     - Seamlessly blends slide content and speaker notes into a single narrative.
     - Avoids non-verbal artifacts like slide numbers, bullet points, hyperlinks, or placeholder text such as "click here" or "see above."
     - Does not include transitional fluff such as "Welcome to…" or "This slide shows…" — only speak the core informational content.
     - Is suitable for a 1.5–3 minute spoken video.

  4. Ensure the transcript:
     - Does not contain the title of the slide.
     - Flows logically, even if the slide layout is fragmented.
     - Expands acronyms or technical terms on first use (e.g., "LLM" becomes "Large Language Model, or LLM").
     - Maintains a neutral, respectful, and professional tone appropriate for stakeholders.

  5. Normalize all numbers, technical terms, and model names into naturally spoken form:
     You must **rewrite all numbers, decimal points, and alphanumeric tokens** so they sound correct when read aloud or used with a text-to-speech system. Use the following phonetic transformation rules:
     - **Decimal numbers**: Convert all numbers with a decimal (e.g., `3.2`) into the form **"three dot two"**. Examples:
       - `3.5` → "three dot five"
       - `3.1` → "three dot one"
       - `3.3` → "three dot three"
       - `2.0` → "two dot oh"
       - `4.0` → "four dot oh"
     - **Model size suffixes**:
       - `70B` → "seventy B"
       - `10M` → "ten M"
       - `2K` → "two K"
       - `10B+` → "ten B plus"
     - **Model names**: Break apart letters and digits where needed for natural clarity.
       - `LLaMA-3.2` → "LLaMA three dot two"
       - `LLaMA 4 Scout` → "LLaMA four Scout"
     - **Large numbers**: Convert `17B` into "seventeen billion", `128` into "one hundred twenty-eight", etc.
       - `16 experts` → "sixteen experts"
       - `128 experts` → "one hundred twenty-eight experts"
     - **Context windows or token counts**: Always use full expansion.
       - `10M` → "ten million"
       - `1T` → "one trillion"
     - **Industry abbreviations**: Break apart the letters.
       - "LLM" → "L L M"
       - "GPU" → "G P U"
       - "AI" → "A I"
     These spoken-form transformations **must be applied consistently across the entire transcript**.

  Final Output:
  - Do not leave any numeric or technical token in a written form that would confuse a voiceover artist or a text-to-speech engine.
  - Provide only the final transcript for voiceover — no markdown, no labels, no extra commentary.
  - Check for numeric mispronunciations before final output.
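
# Not part of the prompt or config: a small, assumed post-processing check (inspired by
# the "Check for numeric mispronunciations" instruction above) that flags tokens the
# model may have left in written form. The pattern list is illustrative, not exhaustive.
#
#   import re
#
#   LEFTOVER = re.compile(r"\d+\.\d+|\b\d+[KMBT]\+?|\b(?:LLM|GPU|AI)\b")
#
#   def flag_unspoken_tokens(transcript: str) -> list[str]:
#       return LEFTOVER.findall(transcript)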