# PPTX to Transcript Configuration

# API Configuration
api:
  groq_model: "meta-llama/llama-4-maverick-17b-128e-instruct"
  max_retries: 3
  retry_delay: 1
  rate_limit_delay: 1

# Processing Configuration
processing:
  default_dpi: 200
  supported_formats: ["png", "jpeg", "jpg"]
  default_format: "png"
  batch_size: 5

# File Paths
paths:
  default_output_dir: "output/"
  cache_dir: "cache"
  logs_dir: "logs"
  temp_dir: "temp"

# Current Project Settings
current_project:
  pptx_file: "input/All About Llamas"  # PowerPoint file path, without the .ppt/.pptx extension
  extension: ".pptx"
  output_dir: "output/"

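# To point the pipeline at a different deck, change pptx_file (the deck name
# below is a hypothetical example):
# current_project:
#   pptx_file: "input/Quarterly Update"
#   extension: ".pptx"
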
# LibreOffice Paths (auto-detected, but can be overridden)
libreoffice:
  possible_paths:
    - "/Applications/LibreOffice.app/Contents/MacOS/soffice"
    - "/usr/bin/soffice"
    - "/usr/local/bin/soffice"
    # Additional Linux paths
    # - "/snap/bin/libreoffice"
    # - "/opt/libreoffice/program/soffice"
    # Windows paths
    # - "C:\\Program Files\\LibreOffice\\program\\soffice.exe"
    # - "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe"
    # - "C:\\Users\\%USERNAME%\\AppData\\Local\\Programs\\LibreOffice\\program\\soffice.exe"
    # - "C:\\PortableApps\\LibreOfficePortable\\App\\libreoffice\\program\\soffice.exe"

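# If auto-detection misses your install, uncomment or add the matching entry
# above. On macOS/Linux, "which soffice" prints the binary location; on
# Windows, "where soffice" does the same. The path below is a placeholder:
# possible_paths:
#   - "/opt/custom/libreoffice/program/soffice"
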
# Logging Configuration
logging:
  level: "INFO"
  format: "%(asctime)s - %(levelname)s - %(message)s"
  file_enabled: true
  console_enabled: true

# Progress Tracking
progress:
  save_interval: 5  # Save progress every N slides
  progress_file: "progress.json"

# Image Quality Settings
image_quality:
  jpeg_quality: 90    # 0-100; higher = better quality, larger files
  jpeg_optimize: true
  png_compression: 6  # 0-9; higher = smaller files, slower encoding

# Knowledge Base Configuration
knowledge:
  # Enable/disable knowledge base integration
  enabled: true
  # Knowledge base directory path (relative to project root)
  knowledge_base_dir: "knowledge_base"

  # Vector store configuration (FAISS)
  vector_store:
    type: "faiss"             # Vector database type
    index_type: "flat"        # "flat", "ivf", or "hnsw"
    use_gpu: false            # Enable GPU acceleration (requires faiss-gpu)
    cache_enabled: true       # Enable persistent caching
    rebuild_on_changes: true  # Auto-rebuild when files change

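  # Rough guide to index_type (standard FAISS behavior): "flat" is exact
  # brute-force search and works well for small corpora; "ivf" and "hnsw" are
  # approximate indexes that trade a little recall for much faster search on
  # large collections ("ivf" also requires a training pass). A larger corpus
  # might use, for example:
  # vector_store:
  #   type: "faiss"
  #   index_type: "hnsw"
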
  # Embedding model configuration
  embedding:
    model_name: "all-MiniLM-L6-v2"  # Lightweight, fast model
    device: "cpu"                   # Use "cuda" if GPU available
    batch_size: 32
    max_seq_length: 512

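  # "all-MiniLM-L6-v2" is a sentence-transformers model that produces
  # 384-dimensional embeddings. Any other sentence-transformers model name
  # should also work here; for example, a larger but higher-quality option:
  # embedding:
  #   model_name: "all-mpnet-base-v2"
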
  # Search configuration
  search:
    top_k: 5                       # Number of knowledge chunks to retrieve
    similarity_threshold: 0.3      # Minimum similarity (0.0 to 1.0)
    enable_keyword_fallback: true  # Fall back to keyword search if similarity search fails
    max_chunk_size: 1000           # Maximum characters per knowledge chunk
    chunk_overlap: 200             # Overlap between adjacent chunks (characters)

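  # With the values above, consecutive chunks advance by
  # max_chunk_size - chunk_overlap = 1000 - 200 = 800 characters, so most text
  # is covered by one or two chunks. For finer-grained retrieval you could
  # use, for example:
  # search:
  #   max_chunk_size: 500
  #   chunk_overlap: 100
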
  # Context integration settings
  context:
    # Strategy for combining knowledge with narrative context
    strategy: "combined"                 # "knowledge_only", "narrative_priority", or "combined"
    max_context_length: 8000             # Maximum total context length (characters)
    knowledge_weight: 0.3                # 0.0 to 1.0; higher = more knowledge influence
    integration_method: "system_prompt"  # "system_prompt" or "user_message"

  # Performance and reliability settings
  performance:
    enable_caching: true          # Cache embeddings and search results
    cache_dir: "cache/knowledge"  # Relative to project root
    cache_expiry_hours: 24        # 0 = never expire
    max_memory_mb: 512            # Maximum memory for embeddings (MB)
    lazy_loading: true            # Load embeddings lazily

  # Fallback options for reliability
  fallback:
    graceful_degradation: true  # Continue processing if the knowledge base fails to load
    use_keyword_fallback: true  # Use simple keyword matching if the embedding model fails
    log_errors_only: true       # Log errors without failing the entire process

# Example system prompt. Replace it with your own, although this one works well.
system_prompt: |
  You are a speech-aware GenAI expert who specializes in generating natural-sounding transcripts for human narration and text-to-speech systems. You are also a GenAI expert specializing in LLaMA vision and language models at Meta.

  Your task is to analyze a PowerPoint slide image and its associated speaker notes and generate a complete, professional transcript suitable for voiceover narration. The voiceover will be recorded for an internal team, so clarity, tone, and spoken correctness are critical. Your goal is to ensure that all technical terms, numbers, and abbreviations are rendered the way a human would say them out loud — clearly, naturally, and without confusion.

  Please follow these detailed steps:

  1. Extract all visual content from the slide image:
     - Detect and extract all visible text elements, including titles, headings, body text, callouts, labels, and captions.
     - Preserve the top-to-bottom, left-to-right visual order to reflect how a human would naturally read the slide.
     - Identify any diagrams, tables, or charts and include a brief verbal explanation only if necessary to communicate the slide's key message.
     - Do not extract hyperlinks.

  2. Combine the extracted text with the provided speaker notes to form a unified understanding of the slide's purpose and content.

  3. Generate a professional voiceover transcript that:
     - Sounds natural, confident, and informative, as if explaining the slide to an internal executive audience.
     - Seamlessly blends slide content and speaker notes into a single narrative.
     - Avoids non-verbal artifacts such as slide numbers, bullet points, hyperlinks, or placeholder text like "click here" or "see above."
     - Does not include transitional fluff such as "Welcome to…" or "This slide shows…" — speak only the core informational content.
     - Is suitable for a 1.5–3 minute spoken video.

  4. Ensure the transcript:
     - Does not contain the title of the slide.
     - Flows logically, even if the slide layout is fragmented.
     - Expands acronyms or technical terms on first use (e.g., "LLM" becomes "Large Language Model, or LLM").
     - Maintains a neutral, respectful, and professional tone appropriate for stakeholders.

  5. Normalize all numbers, technical terms, and model names into naturally spoken form. You must **rewrite all numbers, decimal points, and alphanumeric tokens** so they sound correct when read aloud or used with a text-to-speech system. Use the following phonetic transformation rules:
     - **Decimal numbers**: Convert all numbers with a decimal point (e.g., `3.2`) into the form **"three dot two"**. Examples:
       - `3.5` → "three dot five"
       - `3.1` → "three dot one"
       - `3.3` → "three dot three"
       - `2.0` → "two dot oh"
       - `4.0` → "four dot oh"
     - **Model size suffixes**:
       - `70B` → "seventy B"
       - `10M` → "ten M"
       - `2K` → "two K"
       - `10B+` → "ten B plus"
     - **Model names**: Break apart letters and digits where needed for natural clarity.
       - `LLaMA-3.2` → "LLaMA three dot two"
       - `LLaMA 4 Scout` → "LLaMA four Scout"
     - **Large numbers**: Convert `17B` into "seventeen billion", `128` into "one hundred twenty-eight", etc.
       - `16 experts` → "sixteen experts"
       - `128 experts` → "one hundred twenty-eight experts"
     - **Context windows or token counts**: Always use the full expansion.
       - `10M` → "ten million"
       - `1T` → "one trillion"
     - **Industry abbreviations**: Spell out the letters.
       - "LLM" → "L L M"
       - "GPU" → "G P U"
       - "AI" → "A I"

  These spoken-form transformations **must be applied consistently across the entire transcript**.

  Final output:
  - Do not leave any numeric or technical token in a written form that would confuse a voiceover or text-to-speech engine.
  - Check for numeric mispronunciations before producing the final output.
  - Provide only the final transcript for voiceover — no markdown, no labels, no extra commentary.