{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "af993316-3e76-4c08-bd8c-1b87a9260545", "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import torch\n", "from pathlib import Path\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "from transformers import pipeline\n", "import re" ] }, { "cell_type": "code", "id": "941581c9-46fc-4af9-84bd-9c467378b801", "metadata": {}, "outputs": [], "source": "DEFAULT_MODEL = \"meta-llama/Llama-3.2-3B-Instruct\" \nbase_dir = Path(\"llama_data\")\nresults_dir = base_dir / \"results\"\nparsed_dir = base_dir / \"parsed_content\"\nparsed_dir.mkdir(exist_ok=True)\n\n# Minimum relevance score to keep content (used throughout the notebook)\nMIN_RELEVANCE_SCORE = 6" }, { "cell_type": "code", "execution_count": 4, "id": "cae1a129-39ed-4269-b157-938c348d17b8", "metadata": {}, "outputs": [], "source": [ "SYS_PROMPT = \"\"\"\n", "You are a smart AI Intern, you work with dumb AIs that dont know how to parse HTML. \n", "\n", "This is your moment to make mama GPU proud and secure a data centre! Remember shine and do your job well-you got this!\n", "\n", "Your task is to analyze the provided HTML content and extract the following in JSON format:\n", "1. main_content: The main article or content text (exclude navigation, footers, sidebars, ads)\n", "2. key_points: A list of 3-5 key points or takeaways from the content\n", "3. relevance_score: A score from 0-10 indicating relevance to the search query\n", "\n", "Return ONLY a valid JSON object with these fields, no additional text.\n", "If you cannot parse the HTML properly, return a JSON with error_message field.\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f4b1767-686e-41d7-a038-b3f348e047dd", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "05495e1a316646599a62a9714543e4d2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00 110000:\n", " text = text[:110000] + \"... 
[content truncated]\"\n", " \n", " return text" ] }, { "cell_type": "code", "id": "22998a6f-68bc-4b02-8eaa-8fc3a369b43e", "metadata": {}, "outputs": [], "source": "def parse_html_with_llm(html_path, query, purpose):\n try:\n # Load HTML\n with open(html_path, \"r\", encoding=\"utf-8\") as f:\n html_content = f.read()\n cleaned_text = clean_html_content(html_content)\n \n # Construct prompt\n conversation = [\n {\"role\": \"system\", \"content\": SYS_PROMPT},\n {\"role\": \"user\", \"content\": f\"\"\"\nSearch Query: {query}\nQuery Purpose: {purpose}\n\nHTML Content (cleaned):\n{cleaned_text}\n\nExtract the key information from this content in JSON format according to the instructions.\n\"\"\"}\n ]\n \n output = text_pipeline(\n conversation,\n max_new_tokens=4000, # Reduced from 32000 to a more reasonable size\n temperature=0.01, # cool llm = smart extraction\n do_sample=True,\n )\n \n # Extract the assistant's response\n assistant_response = output[0][\"generated_text\"][-1]\n response_content = assistant_response[\"content\"]\n \n # Print short progress indicator instead of full content\n print(f\"Processing {os.path.basename(html_path)}\")\n \n try:\n json_match = re.search(r'({[\\s\\S]*})', response_content)\n if json_match:\n json_str = json_match.group(1)\n parsed_data = json.loads(json_str)\n else:\n parsed_data = {\"error_message\": \"Failed to extract JSON from LLM response\"}\n except json.JSONDecodeError:\n parsed_data = {\"error_message\": \"Invalid JSON in LLM response\", \"raw_response\": response_content[:500]}\n \n return parsed_data\n \n except Exception as e:\n print(f\"Error processing file: {str(e)}\")\n return {\"error_message\": f\"Error processing file: {str(e)}\"}" }, { "cell_type": "code", "execution_count": 8, "id": "a7537d52-6682-41ed-b5f7-a447c8f60d7b", "metadata": {}, "outputs": [], "source": [ "def process_all_search_results():\n", " with open(base_dir / \"results_so_far.json\", \"r\") as f:\n", " search_results = json.load(f)\n", " \n", " all_parsed_results = []\n", " \n", " for query_data in search_results:\n", " report_index = query_data[\"report_index\"]\n", " report_title = query_data[\"report_title\"]\n", " query_index = query_data[\"query_index\"]\n", " query = query_data[\"query\"]\n", " purpose = query_data[\"purpose\"]\n", "\n", " report_dir_name = f\"report_{report_index}_{report_title.replace(' ', '_').replace(':', '').replace('/', '')[:30]}\"\n", " query_dir_name = f\"query_{query_index}_{query.replace(' ', '_').replace(':', '').replace('/', '')[:30]}\"\n", " parsed_report_dir = parsed_dir / report_dir_name\n", " parsed_report_dir.mkdir(exist_ok=True)\n", " \n", " parsed_query_results = []\n", " \n", " print(f\"\\nProcessing results for query: {query}\")\n", " \n", " for result in query_data[\"results\"]:\n", " result_index = result[\"result_index\"]\n", " title = result[\"title\"]\n", " url = result[\"url\"]\n", " filepath = result[\"filepath\"]\n", " \n", " print(f\" Processing result {result_index + 1}: {title[:50]}...\")\n", " \n", " if filepath and os.path.exists(filepath):\n", " parsed_data = parse_html_with_llm(filepath, query, purpose)\n", " parsed_data.update({\n", " \"result_index\": result_index,\n", " \"title\": title,\n", " \"url\": url,\n", " \"query\": query,\n", " \"purpose\": purpose\n", " })\n", " \n", " result_filename = f\"parsed_result_{result_index}.json\"\n", " with open(parsed_report_dir / result_filename, \"w\") as f:\n", " json.dump(parsed_data, f, indent=2)\n", " \n", " parsed_query_results.append(parsed_data)\n", " 
else:\n", " print(f\" Warning: File not found - {filepath}\")\n", " \n", " query_results = {\n", " \"report_index\": report_index,\n", " \"report_title\": report_title,\n", " \"query_index\": query_index,\n", " \"query\": query,\n", " \"purpose\": purpose,\n", " \"parsed_results\": parsed_query_results\n", " }\n", " \n", " query_filename = f\"parsed_query_{query_index}.json\"\n", " with open(parsed_report_dir / query_filename, \"w\") as f:\n", " json.dump(query_results, f, indent=2)\n", " \n", " all_parsed_results.append(query_results)\n", " \n", " with open(parsed_dir / \"all_parsed_results.json\", \"w\") as f:\n", " json.dump(all_parsed_results, f, indent=2)\n", " \n", " return all_parsed_results" ] }, { "cell_type": "code", "id": "deb15c42-6cdd-4fa7-8823-cce4b17732d7", "metadata": {}, "outputs": [], "source": "def generate_report_summaries(all_parsed_results):\n # Load original outlines and metadata\n try:\n with open('generated_outlines.json', 'r') as f:\n original_outlines = json.load(f)\n except FileNotFoundError:\n print(\"Warning: generated_outlines.json not found. Proceeding without original outlines.\")\n original_outlines = []\n \n # Create a dictionary mapping report_index to original outline data\n original_data_by_index = {}\n if original_outlines:\n for i, outline in enumerate(original_outlines):\n original_data_by_index[i] = {\n \"original_goal\": outline.get(\"original_goal\", {}),\n \"personality\": outline.get(\"personality\", {}),\n \"vibe\": outline.get(\"vibe\", \"\"),\n \"outline_structure\": outline.get(\"outline\", []),\n \"web_queries\": outline.get(\"Web Queries\", [])\n }\n \n report_summaries = {}\n \n # Minimum relevance score to keep content\n MIN_RELEVANCE_SCORE = 6\n \n for query_result in all_parsed_results:\n report_index = query_result[\"report_index\"]\n report_title = query_result[\"report_title\"]\n \n if report_index not in report_summaries:\n report_summaries[report_index] = {\n \"report_title\": report_title,\n \"queries\": [],\n # Add original data if available\n **(original_data_by_index.get(report_index, {}))\n }\n \n # Filter out low-relevance results\n filtered_results = [\n r for r in query_result[\"parsed_results\"] \n if r.get(\"relevance_score\", 0) >= MIN_RELEVANCE_SCORE\n ]\n \n if not filtered_results:\n print(f\"Warning: No high-relevance results for query: {query_result['query']}\")\n # Skip this query if it has no relevant results\n continue\n \n query_summary = {\n \"query\": query_result[\"query\"],\n \"purpose\": query_result[\"purpose\"],\n \"result_count\": len(filtered_results),\n \"average_relevance\": sum(r.get(\"relevance_score\", 0) for r in filtered_results) / \n max(1, len(filtered_results)),\n \"relevant_results\": [\n {\n \"title\": r[\"title\"],\n \"url\": r[\"url\"],\n \"main_content\": r.get(\"main_content\", \"No content available\"),\n \"key_points\": r.get(\"key_points\", []),\n \"relevance_score\": r.get(\"relevance_score\", 0)\n }\n for r in sorted(\n filtered_results, \n key=lambda x: x.get(\"relevance_score\", 0), \n reverse=True\n )\n ]\n }\n \n report_summaries[report_index][\"queries\"].append(query_summary)\n \n for report_index, report_data in report_summaries.items():\n print(f\"\\nGenerating summary for report: {report_data['report_title']}\")\n \n # Construct summary prompt\n queries_info = \"\\n\\n\".join([\n f\"Query: {q['query']}\\nPurpose: {q['purpose']}\\nTop Results:\\n\" + \n \"\\n\".join([f\"- {r['title']}: {' '.join(r['key_points'][:2])}\" for r in q[\"relevant_results\"][:3]])\n for q in 
report_data[\"queries\"]\n ])\n \n summary_prompt = f\"\"\"\nReport Title: {report_data['report_title']}\n\nThe following searches were conducted for this report:\n\n{queries_info}\n\nBased on these search results, generate a brief report outline with:\n1. Key findings across all queries\n2. Important data points uncovered\n3. Suggested sections for the final report\n4. Areas where more research might be needed\n\nReturn this as a JSON with fields: key_findings, data_points, suggested_sections, and research_gaps.\n\"\"\"\n \n conversation = [\n {\"role\": \"system\", \"content\": \"You are a research assistant who helps summarize findings from web searches into structured report outlines.\"},\n {\"role\": \"user\", \"content\": summary_prompt}\n ]\n \n # Generate report summary\n output = text_pipeline(\n conversation,\n max_new_tokens=4000, # Reduced from 32000\n temperature=0.1,\n )\n \n # Extract the assistant's response\n assistant_response = output[0][\"generated_text\"][-1]\n response_content = assistant_response[\"content\"]\n \n # Extract JSON from response\n try:\n json_match = re.search(r'({[\\s\\S]*})', response_content)\n if json_match:\n json_str = json_match.group(1)\n report_summary = json.loads(json_str)\n else:\n report_summary = {\"error\": \"Failed to extract JSON from LLM response\"}\n except json.JSONDecodeError:\n report_summary = {\"error\": \"Invalid JSON in LLM response\"}\n \n report_data[\"generated_summary\"] = report_summary\n \n # Save enriched report summaries with all original context\n enriched_reports_path = parsed_dir / \"enriched_reports.json\"\n with open(enriched_reports_path, \"w\") as f:\n json.dump(report_summaries, f, indent=2)\n \n # Also save a simplified version for backward compatibility\n with open(parsed_dir / \"report_summaries.json\", \"w\") as f:\n json.dump(report_summaries, f, indent=2)\n \n print(f\"\\nEnriched reports saved to: {enriched_reports_path}\")\n return report_summaries" }, { "cell_type": "code", "id": "0c00ef8b-bd83-4b31-96ba-86832a13774c", "metadata": {}, "outputs": [], "source": "print(\"Starting HTML parsing process with LLM...\")\nall_parsed_results = process_all_search_results()\n\nprint(\"\\nGenerating report summaries...\")\nreport_summaries = generate_report_summaries(all_parsed_results)\n\nprint(\"\\nProcessing complete. 
Results saved to:\")\nprint(f\"- All parsed results: {parsed_dir / 'all_parsed_results.json'}\")\nprint(f\"- Enriched reports with original context: {parsed_dir / 'enriched_reports.json'}\")\n\ntotal_queries = len(all_parsed_results)\ntotal_results = sum(len(query[\"parsed_results\"]) for query in all_parsed_results)\ntotal_reports = len(report_summaries)\n\nprint(f\"\\nSummary Statistics:\")\nprint(f\"- Total Reports: {total_reports}\")\nprint(f\"- Total Queries: {total_queries}\")\nprint(f\"- Total Results Parsed: {total_results}\")\n\n# Calculate statistics only on high-relevance results\nMIN_RELEVANCE_SCORE = 6\nhigh_relevance_results = [\n result\n for query in all_parsed_results \n for result in query[\"parsed_results\"]\n if result.get(\"relevance_score\", 0) >= MIN_RELEVANCE_SCORE\n]\n\ntotal_high_relevance = len(high_relevance_results)\nprint(f\"- High Relevance Results (score >= {MIN_RELEVANCE_SCORE}): {total_high_relevance}\")\n\nif high_relevance_results:\n avg_relevance = sum(result.get(\"relevance_score\", 0) for result in high_relevance_results) / total_high_relevance\n print(f\"- Average Relevance Score (high relevance only): {avg_relevance:.2f}/10\")\n\n# Display a sample of one report's structure (to verify format)\nif report_summaries:\n sample_report_index = list(report_summaries.keys())[0]\n sample_report = report_summaries[sample_report_index]\n print(f\"\\nSample Structure for Report '{sample_report['report_title']}':\")\n print(\"- Original metadata included\")\n print(\"- Queries with filtered high-relevance results\")\n print(\"- Generated summary included\")\n print(\"Ready for the next step in the workflow!\")" }, { "cell_type": "code", "execution_count": null, "id": "3fca3762-2400-4729-b1e5-4e8c069ecf01", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }