{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import torch\n",
    "from pathlib import Path\n",
    "import re\n",
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the larger model for high-quality report generation\n",
    "DEFAULT_MODEL = \"meta-llama/Llama-3.3-70B-Instruct\" \n",
    "\n",
    "# Set up directories\n",
    "base_dir = Path(\"llama_data\")\n",
    "parsed_dir = base_dir / \"parsed_content\"\n",
    "reports_dir = base_dir / \"final_reports\"\n",
    "reports_dir.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: Fill in your system prompt here\n",
    "SYS_PROMPT = \"\"\"\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the model\n",
    "text_pipeline = pipeline(\n",
    "    \"text-generation\",\n",
    "    model=DEFAULT_MODEL,\n",
    "    model_kwargs={\"torch_dtype\": torch.bfloat16},\n",
    "    device_map=\"auto\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_enriched_reports():\n",
    "    \"\"\"Load the enriched reports from the previous step\"\"\"\n",
    "    try:\n",
    "        with open(parsed_dir / \"enriched_reports.json\", \"r\") as f:\n",
    "            return json.load(f)\n",
    "    except FileNotFoundError:\n",
    "        print(\"Error: enriched_reports.json not found. Please run Step 4 first.\")\n",
    "        return {}"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": "def load_enriched_reports():\n    \"\"\"Load the enriched reports from the previous step\"\"\"\n    try:\n        with open(parsed_dir / \"enriched_reports.json\", \"r\") as f:\n            enriched_reports = json.load(f)\n            \n        # Validate structure of the loaded data\n        if not enriched_reports:\n            print(\"Warning: enriched_reports.json is empty\")\n            return {}\n            \n        # Check one report to validate structure\n        sample_report_id = list(enriched_reports.keys())[0]\n        sample_report = enriched_reports[sample_report_id]\n        \n        # Output report structure for verification\n        print(f\"Loaded {len(enriched_reports)} reports\")\n        print(f\"Report structure validation for '{sample_report['report_title']}':\")\n        \n        # Check key components\n        print(\"- Metadata:\", \"Personality\" if \"personality\" in sample_report else \"MISSING\",\n              \"| Vibe\" if \"vibe\" in sample_report else \"MISSING\",\n              \"| Outline\" if \"outline_structure\" in sample_report else \"MISSING\")\n        \n        # Check queries structure\n        queries = sample_report.get(\"queries\", [])\n        if queries:\n            print(f\"- Queries: {len(queries)} found\")\n            # Check a sample query\n            if queries[0].get(\"relevant_results\"):\n                print(f\"  - Results: {len(queries[0]['relevant_results'])} found\")\n            else:\n                print(\"  - MISSING: relevant_results not found in queries\")\n        else:\n            print(\"- MISSING: No queries found\")\n        \n        return enriched_reports\n        \n    except FileNotFoundError:\n        print(\"Error: enriched_reports.json not found. Please run Step 4 first.\")\n        return {}\n    except json.JSONDecodeError:\n        print(\"Error: enriched_reports.json is not valid JSON.\")\n        return {}\n    except Exception as e:\n        print(f\"Error loading enriched reports: {str(e)}\")\n        return {}"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_relevant_findings_for_section(section_title, report_data):\n",
    "    \"\"\"Match the most relevant findings for a specific section\"\"\"\n",
    "    \n",
    "    all_findings = []\n",
    "    \n",
    "    # Collect all relevant research findings\n",
    "    for query in report_data.get(\"queries\", []):\n",
    "        for result in query.get(\"relevant_results\", []):\n",
    "            finding = {\n",
    "                \"title\": result[\"title\"],\n",
    "                \"url\": result[\"url\"],\n",
    "                \"key_points\": result.get(\"key_points\", []),\n",
    "                \"relevance\": result.get(\"relevance_score\", 0)\n",
    "            }\n",
    "            \n",
    "            # Simple keyword matching to find relevance to this section\n",
    "            # Convert to lowercase for case-insensitive matching\n",
    "            section_keywords = section_title.lower().split()\n",
    "            content_text = \" \".join(finding[\"key_points\"]).lower()\n",
    "            \n",
    "            # Count how many section keywords appear in the content\n",
    "            keyword_matches = sum(1 for keyword in section_keywords if keyword in content_text)\n",
    "            \n",
    "            # Add section relevance score\n",
    "            finding[\"section_relevance\"] = keyword_matches * 2 + finding[\"relevance\"]\n",
    "            \n",
    "            all_findings.append(finding)\n",
    "    \n",
    "    # Sort by relevance to this section and overall relevance\n",
    "    sorted_findings = sorted(all_findings, key=lambda x: x[\"section_relevance\"], reverse=True)\n",
    "    \n",
    "    # Return top findings (limit to avoid overwhelming the model)\n",
    "    return sorted_findings[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_full_report(report_id, report_data):\n",
    "    \"\"\"Generate a complete report using the enriched data\"\"\"\n",
    "    \n",
    "    report_title = report_data[\"report_title\"]\n",
    "    print(f\"Generating report: {report_title}\")\n",
    "    \n",
    "    # Get the outline structure, or create a default if not available\n",
    "    outline_structure = report_data.get(\"outline_structure\", [])\n",
    "    if not outline_structure:\n",
    "        # If no outline is available, create a generic one from the generated summary\n",
    "        suggested_sections = report_data.get(\"generated_summary\", {}).get(\"suggested_sections\", [])\n",
    "        if suggested_sections:\n",
    "            outline_structure = [\"Introduction\"] + suggested_sections + [\"Conclusion\"]\n",
    "        else:\n",
    "            outline_structure = [\"Introduction\", \"Key Features\", \"Benefits\", \"Applications\", \"Future Outlook\", \"Conclusion\"]\n",
    "    \n",
    "    # Start with title and vibe\n",
    "    report_content = f\"# {report_title}\\n\\n\"\n",
    "    report_content += f\"*{report_data.get('vibe', 'Informative and engaging')}*\\n\\n\"\n",
    "    \n",
    "    # Generate each section\n",
    "    for section in outline_structure:\n",
    "        print(f\"  Generating section: {section}\")\n",
    "        \n",
    "        # Get relevant findings for this section\n",
    "        relevant_findings = get_relevant_findings_for_section(section, report_data)\n",
    "        \n",
    "        # Generate the section content\n",
    "        section_content = generate_report_section(section, report_data, relevant_findings)\n",
    "        \n",
    "        # Add section heading and content to report\n",
    "        report_content += f\"## {section}\\n\\n\"\n",
    "        report_content += f\"{section_content}\\n\\n\"\n",
    "    \n",
    "    # Add sources section at the end\n",
    "    report_content += \"## Sources\\n\\n\"\n",
    "    \n",
    "    # Collect all unique sources\n",
    "    sources = set()\n",
    "    for query in report_data.get(\"queries\", []):\n",
    "        for result in query.get(\"relevant_results\", []):\n",
    "            sources.add(f\"- {result['title']}: {result['url']}\")\n",
    "    \n",
    "    # Add sources to report\n",
    "    report_content += \"\\n\".join(sorted(list(sources)))\n",
    "    \n",
    "    # Save the report\n",
    "    filename = f\"report_{report_id}_{report_title.replace(' ', '_')[:30]}.txt\"\n",
    "    report_path = reports_dir / filename\n",
    "    \n",
    "    with open(report_path, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(report_content)\n",
    "    \n",
    "    return report_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_all_reports():\n",
    "    \"\"\"Generate all reports from the enriched data\"\"\"\n",
    "    \n",
    "    # Load the enriched reports\n",
    "    enriched_reports = load_enriched_reports()\n",
    "    \n",
    "    if not enriched_reports:\n",
    "        print(\"No reports to generate.\")\n",
    "        return []\n",
    "    \n",
    "    report_paths = []\n",
    "    \n",
    "    # Generate each report\n",
    "    for report_id, report_data in enriched_reports.items():\n",
    "        report_path = generate_full_report(report_id, report_data)\n",
    "        report_paths.append(report_path)\n",
    "    \n",
    "    return report_paths"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": "print(\"Starting report generation...\")\n\n# First, validate and analyze the input data\nenriched_reports = load_enriched_reports()\n\nif enriched_reports:\n    print(\"\\nStarting generation of individual reports...\\n\")\n    report_paths = generate_all_reports()\n    \n    print(\"\\nReport generation complete!\")\n    print(f\"Generated {len(report_paths)} reports:\")\n    \n    for path in report_paths:\n        print(f\"- {path}\")\nelse:\n    print(\"Report generation skipped due to missing or invalid input data.\")"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}