{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import torch\n", "from pathlib import Path\n", "import re\n", "from transformers import pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use the larger model for high-quality report generation\n", "DEFAULT_MODEL = \"meta-llama/Llama-3.3-70B-Instruct\" \n", "\n", "# Set up directories\n", "base_dir = Path(\"llama_data\")\n", "parsed_dir = base_dir / \"parsed_content\"\n", "reports_dir = base_dir / \"final_reports\"\n", "reports_dir.mkdir(exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# TODO: Fill in your system prompt here\n", "SYS_PROMPT = \"\"\"\n", "\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Initialize the model\n", "text_pipeline = pipeline(\n", " \"text-generation\",\n", " model=DEFAULT_MODEL,\n", " model_kwargs={\"torch_dtype\": torch.bfloat16},\n", " device_map=\"auto\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_enriched_reports():\n", " \"\"\"Load the enriched reports from the previous step\"\"\"\n", " try:\n", " with open(parsed_dir / \"enriched_reports.json\", \"r\") as f:\n", " return json.load(f)\n", " except FileNotFoundError:\n", " print(\"Error: enriched_reports.json not found. Please run Step 4 first.\")\n", " return {}" ] }, { "cell_type": "code", "metadata": {}, "outputs": [], "source": "def load_enriched_reports():\n \"\"\"Load the enriched reports from the previous step\"\"\"\n try:\n with open(parsed_dir / \"enriched_reports.json\", \"r\") as f:\n enriched_reports = json.load(f)\n \n # Validate structure of the loaded data\n if not enriched_reports:\n print(\"Warning: enriched_reports.json is empty\")\n return {}\n \n # Check one report to validate structure\n sample_report_id = list(enriched_reports.keys())[0]\n sample_report = enriched_reports[sample_report_id]\n \n # Output report structure for verification\n print(f\"Loaded {len(enriched_reports)} reports\")\n print(f\"Report structure validation for '{sample_report['report_title']}':\")\n \n # Check key components\n print(\"- Metadata:\", \"Personality\" if \"personality\" in sample_report else \"MISSING\",\n \"| Vibe\" if \"vibe\" in sample_report else \"MISSING\",\n \"| Outline\" if \"outline_structure\" in sample_report else \"MISSING\")\n \n # Check queries structure\n queries = sample_report.get(\"queries\", [])\n if queries:\n print(f\"- Queries: {len(queries)} found\")\n # Check a sample query\n if queries[0].get(\"relevant_results\"):\n print(f\" - Results: {len(queries[0]['relevant_results'])} found\")\n else:\n print(\" - MISSING: relevant_results not found in queries\")\n else:\n print(\"- MISSING: No queries found\")\n \n return enriched_reports\n \n except FileNotFoundError:\n print(\"Error: enriched_reports.json not found. Please run Step 4 first.\")\n return {}\n except json.JSONDecodeError:\n print(\"Error: enriched_reports.json is not valid JSON.\")\n return {}\n except Exception as e:\n print(f\"Error loading enriched reports: {str(e)}\")\n return {}" }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_relevant_findings_for_section(section_title, report_data):\n", " \"\"\"Match the most relevant findings for a specific section\"\"\"\n", " \n", " all_findings = []\n", " \n", " # Collect all relevant research findings\n", " for query in report_data.get(\"queries\", []):\n", " for result in query.get(\"relevant_results\", []):\n", " finding = {\n", " \"title\": result[\"title\"],\n", " \"url\": result[\"url\"],\n", " \"key_points\": result.get(\"key_points\", []),\n", " \"relevance\": result.get(\"relevance_score\", 0)\n", " }\n", " \n", " # Simple keyword matching to find relevance to this section\n", " # Convert to lowercase for case-insensitive matching\n", " section_keywords = section_title.lower().split()\n", " content_text = \" \".join(finding[\"key_points\"]).lower()\n", " \n", " # Count how many section keywords appear in the content\n", " keyword_matches = sum(1 for keyword in section_keywords if keyword in content_text)\n", " \n", " # Add section relevance score\n", " finding[\"section_relevance\"] = keyword_matches * 2 + finding[\"relevance\"]\n", " \n", " all_findings.append(finding)\n", " \n", " # Sort by relevance to this section and overall relevance\n", " sorted_findings = sorted(all_findings, key=lambda x: x[\"section_relevance\"], reverse=True)\n", " \n", " # Return top findings (limit to avoid overwhelming the model)\n", " return sorted_findings[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_full_report(report_id, report_data):\n", " \"\"\"Generate a complete report using the enriched data\"\"\"\n", " \n", " report_title = report_data[\"report_title\"]\n", " print(f\"Generating report: {report_title}\")\n", " \n", " # Get the outline structure, or create a default if not available\n", " outline_structure = report_data.get(\"outline_structure\", [])\n", " if not outline_structure:\n", " # If no outline is available, create a generic one from the generated summary\n", " suggested_sections = report_data.get(\"generated_summary\", {}).get(\"suggested_sections\", [])\n", " if suggested_sections:\n", " outline_structure = [\"Introduction\"] + suggested_sections + [\"Conclusion\"]\n", " else:\n", " outline_structure = [\"Introduction\", \"Key Features\", \"Benefits\", \"Applications\", \"Future Outlook\", \"Conclusion\"]\n", " \n", " # Start with title and vibe\n", " report_content = f\"# {report_title}\\n\\n\"\n", " report_content += f\"*{report_data.get('vibe', 'Informative and engaging')}*\\n\\n\"\n", " \n", " # Generate each section\n", " for section in outline_structure:\n", " print(f\" Generating section: {section}\")\n", " \n", " # Get relevant findings for this section\n", " relevant_findings = get_relevant_findings_for_section(section, report_data)\n", " \n", " # Generate the section content\n", " section_content = generate_report_section(section, report_data, relevant_findings)\n", " \n", " # Add section heading and content to report\n", " report_content += f\"## {section}\\n\\n\"\n", " report_content += f\"{section_content}\\n\\n\"\n", " \n", " # Add sources section at the end\n", " report_content += \"## Sources\\n\\n\"\n", " \n", " # Collect all unique sources\n", " sources = set()\n", " for query in report_data.get(\"queries\", []):\n", " for result in query.get(\"relevant_results\", []):\n", " sources.add(f\"- {result['title']}: {result['url']}\")\n", " \n", " # Add sources to report\n", " report_content += \"\\n\".join(sorted(list(sources)))\n", " \n", " # Save the report\n", " filename = f\"report_{report_id}_{report_title.replace(' ', '_')[:30]}.txt\"\n", " report_path = reports_dir / filename\n", " \n", " with open(report_path, \"w\", encoding=\"utf-8\") as f:\n", " f.write(report_content)\n", " \n", " return report_path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_all_reports():\n", " \"\"\"Generate all reports from the enriched data\"\"\"\n", " \n", " # Load the enriched reports\n", " enriched_reports = load_enriched_reports()\n", " \n", " if not enriched_reports:\n", " print(\"No reports to generate.\")\n", " return []\n", " \n", " report_paths = []\n", " \n", " # Generate each report\n", " for report_id, report_data in enriched_reports.items():\n", " report_path = generate_full_report(report_id, report_data)\n", " report_paths.append(report_path)\n", " \n", " return report_paths" ] }, { "cell_type": "code", "metadata": {}, "outputs": [], "source": "print(\"Starting report generation...\")\n\n# First, validate and analyze the input data\nenriched_reports = load_enriched_reports()\n\nif enriched_reports:\n print(\"\\nStarting generation of individual reports...\\n\")\n report_paths = generate_all_reports()\n \n print(\"\\nReport generation complete!\")\n print(f\"Generated {len(report_paths)} reports:\")\n \n for path in report_paths:\n print(f\"- {path}\")\nelse:\n print(\"Report generation skipped due to missing or invalid input data.\")" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }