In [None]:
import json
import os
import torch
from pathlib import Path
import re
from transformers import pipeline

In [None]:
# Use the larger model for high-quality report generation
DEFAULT_MODEL = "meta-llama/Llama-3.3-70B-Instruct"

# Set up directories
base_dir = Path("llama_data")
parsed_dir = base_dir / "parsed_content"  # input: enriched_reports.json is read from here
reports_dir = base_dir / "final_reports"  # output: generated report files are written here

# parents=True so the cell also succeeds when llama_data/ itself does not exist yet;
# without it, mkdir raises FileNotFoundError for the missing parent directory.
reports_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# TODO: Fill in your system prompt here
# As written, SYS_PROMPT is a placeholder containing only two newline characters.
# NOTE(review): no visible cell reads SYS_PROMPT; presumably the section-generation
# helper passes it to the model as the system message — confirm where it is consumed
# before relying on it.
SYS_PROMPT = """

"""

In [None]:
# Initialize the model
# Builds a Hugging Face "text-generation" pipeline for DEFAULT_MODEL and keeps it in
# the module-level name text_pipeline.
# NOTE(review): no visible cell calls text_pipeline; presumably generate_report_section
# (not shown here) uses it — confirm.
text_pipeline = pipeline(
 "text-generation",
 model=DEFAULT_MODEL,
 # Requests bfloat16 weights via the kwargs forwarded to the model loader.
 model_kwargs={"torch_dtype": torch.bfloat16},
 # "auto" leaves device placement to the library rather than pinning a device here.
 device_map="auto",
)

In [None]:
def load_enriched_reports():
    """Load the enriched reports from the previous step.

    Reads ``enriched_reports.json`` from ``parsed_dir`` and returns the decoded JSON.

    NOTE: a later cell in this notebook defines ``load_enriched_reports`` again; on a
    top-to-bottom run that later definition replaces this one.

    Returns:
        The decoded JSON content. An empty dict is returned (after printing an
        explanatory message) when the file is missing or cannot be decoded as JSON.
    """
    reports_file = parsed_dir / "enriched_reports.json"
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
        with open(reports_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print("Error: enriched_reports.json not found. Please run Step 4 first.")
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A corrupt or truncated file is reported the same way as a missing one
        # instead of surfacing as a raw traceback.
        print("Error: enriched_reports.json is not valid JSON.")
        return {}

In [ ]:
def _print_report_structure(sample_report):
    """Print which expected parts of a single report are present."""
    # Indexing (not .get) on purpose: a sample without a title raises KeyError, which
    # the caller's generic handler turns into a rejected load.
    print(f"Report structure validation for '{sample_report['report_title']}':")

    # Every field keeps its label, so an absent one reads "Vibe: MISSING" rather than
    # an unlabeled "MISSING" that appears to belong to a neighbouring field.
    metadata_fields = (
        ("Personality", "personality"),
        ("Vibe", "vibe"),
        ("Outline", "outline_structure"),
    )
    metadata_status = [
        label if key in sample_report else f"{label}: MISSING"
        for label, key in metadata_fields
    ]
    print("- Metadata:", " | ".join(metadata_status))

    queries = sample_report.get("queries", [])
    if queries:
        print(f"- Queries: {len(queries)} found")
        # Only the first query is inspected as a sample.
        if queries[0].get("relevant_results"):
            print(f" - Results: {len(queries[0]['relevant_results'])} found")
        else:
            print(" - MISSING: relevant_results not found in queries")
    else:
        print("- MISSING: No queries found")


def load_enriched_reports():
    """Load the enriched reports from the previous step and print a structure check.

    Reads ``enriched_reports.json`` from ``parsed_dir``. When the file holds a
    non-empty JSON object, the first report in it is inspected and a short summary of
    its structure (metadata keys, queries, results) is printed for verification.

    Returns:
        The decoded mapping of report id to report data. An empty dict is returned,
        after printing a message, when the file is missing, empty, not valid JSON,
        not a JSON object, or when inspecting the sample report raises.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
        with open(parsed_dir / "enriched_reports.json", "r", encoding="utf-8") as f:
            enriched_reports = json.load(f)

        if not enriched_reports:
            print("Warning: enriched_reports.json is empty")
            return {}

        # Callers iterate with .items(), so anything but an object is unusable; say so
        # directly instead of failing later with an opaque attribute error.
        if not isinstance(enriched_reports, dict):
            print("Error: enriched_reports.json must contain a JSON object keyed by report id.")
            return {}

        print(f"Loaded {len(enriched_reports)} reports")

        # Check one report (the first) to validate structure.
        _print_report_structure(next(iter(enriched_reports.values())))

        return enriched_reports

    except FileNotFoundError:
        print("Error: enriched_reports.json not found. Please run Step 4 first.")
        return {}
    except json.JSONDecodeError:
        print("Error: enriched_reports.json is not valid JSON.")
        return {}
    except Exception as e:
        # Deliberate catch-all: any other failure is reported and treated as
        # "no usable input" so the notebook keeps running.
        print(f"Error loading enriched reports: {e}")
        return {}

In [None]:
def _numeric_relevance(raw_score):
    """Return raw_score as a number, or 0 when it is missing or not numeric."""
    if isinstance(raw_score, (int, float)):
        return raw_score
    try:
        return float(raw_score)  # e.g. a score serialized as the string "0.8"
    except (TypeError, ValueError):
        return 0


def get_relevant_findings_for_section(section_title, report_data, max_findings=5, keyword_weight=2):
    """Match the most relevant findings for a specific section.

    Every result under ``report_data["queries"][*]["relevant_results"]`` is scored as
    ``keyword_weight * (number of section-title words found in its key points)`` plus
    its own ``relevance_score``. Matching is a case-insensitive substring test against
    the joined key points.

    Args:
        section_title: Section heading whose whitespace-separated words are the keywords.
        report_data: Report dict; ``queries`` / ``relevant_results`` may be absent or null.
        max_findings: Maximum number of findings to return (``None`` returns all).
            Defaults to 5 to avoid overwhelming the model.
        keyword_weight: Multiplier applied to the keyword-match count. Defaults to 2.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``key_points``, ``relevance`` and
        ``section_relevance``, sorted by ``section_relevance`` descending. Ties keep
        their original order.

    Raises:
        KeyError: If a result has no ``title`` or ``url``.
    """
    # Loop-invariant, so computed once rather than per result.
    section_keywords = section_title.lower().split()

    all_findings = []

    # Collect all relevant research findings
    for query in report_data.get("queries") or []:
        for result in query.get("relevant_results") or []:
            # JSON null (None) for these fields is treated the same as "absent".
            key_points = result.get("key_points") or []
            relevance = _numeric_relevance(result.get("relevance_score"))

            # str() guards against non-string entries; it is a no-op for strings.
            content_text = " ".join(str(point) for point in key_points).lower()

            # Count how many section keywords appear in the content
            keyword_matches = sum(1 for keyword in section_keywords if keyword in content_text)

            all_findings.append({
                "title": result["title"],
                "url": result["url"],
                "key_points": key_points,
                "relevance": relevance,
                "section_relevance": keyword_matches * keyword_weight + relevance,
            })

    # Sort by relevance to this section and overall relevance
    sorted_findings = sorted(all_findings, key=lambda finding: finding["section_relevance"], reverse=True)

    # Return top findings (limit to avoid overwhelming the model)
    return sorted_findings[:max_findings]

In [None]:
# Characters that are path separators or otherwise not allowed in file names on
# common file systems, plus ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

_DEFAULT_OUTLINE = ["Introduction", "Key Features", "Benefits", "Applications", "Future Outlook", "Conclusion"]


def _safe_filename_part(text):
    """Turn arbitrary text into a file-name fragment (spaces and unsafe chars become '_')."""
    return _UNSAFE_FILENAME_CHARS.sub("_", str(text).replace(" ", "_"))


def _resolve_outline(report_data):
    """Return the report's outline, falling back to suggested sections, then a generic outline."""
    outline_structure = report_data.get("outline_structure") or []
    if outline_structure:
        return outline_structure

    # `or {}` also covers an explicit null for generated_summary.
    suggested_sections = (report_data.get("generated_summary") or {}).get("suggested_sections") or []
    if suggested_sections:
        return ["Introduction"] + list(suggested_sections) + ["Conclusion"]
    return list(_DEFAULT_OUTLINE)


def _collect_sources(report_data):
    """Return the unique '- title: url' source lines of a report, sorted alphabetically."""
    sources = set()
    for query in report_data.get("queries") or []:
        for result in query.get("relevant_results") or []:
            sources.add(f"- {result['title']}: {result['url']}")
    return sorted(sources)


def generate_full_report(report_id, report_data):
    """Generate a complete report using the enriched data.

    The report is assembled as Markdown-style text: a title line, the report's "vibe"
    in italics, one ``##`` section per outline entry, and a final ``## Sources`` list.
    It is written as UTF-8 to ``reports_dir``.

    NOTE(review): ``generate_report_section`` is called here but is not defined in the
    cells visible in this part of the notebook — confirm it is defined before this runs.

    Args:
        report_id: Identifier used in the output file name.
        report_data: Report dict. ``report_title`` is required; ``outline_structure``,
            ``generated_summary``, ``vibe`` and ``queries`` are optional.

    Returns:
        Path of the written report file, named
        ``report_<id>_<first 30 chars of title>.txt`` with spaces and characters that
        are unsafe in file names replaced by ``_``.

    Raises:
        KeyError: If ``report_title`` is missing, or a result lacks ``title``/``url``.
    """
    report_title = report_data["report_title"]
    print(f"Generating report: {report_title}")

    # Get the outline structure, or create a default if not available
    outline_structure = _resolve_outline(report_data)

    # Start with title and vibe
    parts = [
        f"# {report_title}\n\n",
        f"*{report_data.get('vibe', 'Informative and engaging')}*\n\n",
    ]

    # Generate each section
    for section in outline_structure:
        print(f" Generating section: {section}")

        relevant_findings = get_relevant_findings_for_section(section, report_data)
        section_content = generate_report_section(section, report_data, relevant_findings)

        parts.append(f"## {section}\n\n{section_content}\n\n")

    # Add sources section at the end
    parts.append("## Sources\n\n")
    parts.append("\n".join(_collect_sources(report_data)))
    report_content = "".join(parts)

    # Save the report. Title and id come from data, so both are sanitized: a raw "/"
    # would otherwise be read as a directory separator and the write would fail or
    # land outside reports_dir.
    filename = f"report_{_safe_filename_part(report_id)}_{_safe_filename_part(report_title)[:30]}.txt"
    reports_dir.mkdir(parents=True, exist_ok=True)  # no-op when the directory already exists
    report_path = reports_dir / filename

    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_content)

    return report_path

In [None]:
def generate_all_reports():
    """Build every report found in the enriched data.

    Loads the enriched reports via ``load_enriched_reports()`` and runs
    ``generate_full_report`` once per (report id, report data) pair, in the order the
    loaded mapping yields them.

    Returns:
        A list with the value returned by ``generate_full_report`` for each report, or
        an empty list (after printing a notice) when nothing was loaded.
    """
    reports_by_id = load_enriched_reports()

    if reports_by_id:
        # One generated report per entry; order follows the loaded mapping.
        return [
            generate_full_report(current_id, current_data)
            for current_id, current_data in reports_by_id.items()
        ]

    # Nothing loaded: tell the user and hand back an empty result.
    print("No reports to generate.")
    return []

In [ ]:
print("Starting report generation...")

# First, validate and analyze the input data
enriched_reports = load_enriched_reports()

if enriched_reports:
    print("\nStarting generation of individual reports...\n")
    # Generate directly from the data loaded above. generate_all_reports() performs
    # the same loop but calls load_enriched_reports() itself first, which would read
    # the file a second time and print the load diagnostics twice.
    report_paths = [
        generate_full_report(report_id, report_data)
        for report_id, report_data in enriched_reports.items()
    ]

    print("\nReport generation complete!")
    print(f"Generated {len(report_paths)} reports:")

    for path in report_paths:
        print(f"- {path}")
else:
    print("Report generation skipped due to missing or invalid input data.")