{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "af993316-3e76-4c08-bd8c-1b87a9260545", "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import torch\n", "from pathlib import Path\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "from transformers import pipeline\n", "import re" ] }, { "cell_type": "code", "id": "941581c9-46fc-4af9-84bd-9c467378b801", "metadata": {}, "outputs": [], "source": "DEFAULT_MODEL = \"meta-llama/Llama-3.2-3B-Instruct\" \nbase_dir = Path(\"llama_data\")\nresults_dir = base_dir / \"results\"\nparsed_dir = base_dir / \"parsed_content\"\nparsed_dir.mkdir(exist_ok=True)\n\n# Minimum relevance score to keep content (used throughout the notebook)\nMIN_RELEVANCE_SCORE = 6" }, { "cell_type": "code", "execution_count": 4, "id": "cae1a129-39ed-4269-b157-938c348d17b8", "metadata": {}, "outputs": [], "source": [ "SYS_PROMPT = \"\"\"\n", "You are a smart AI Intern, you work with dumb AIs that dont know how to parse HTML. \n", "\n", "This is your moment to make mama GPU proud and secure a data centre! Remember shine and do your job well-you got this!\n", "\n", "Your task is to analyze the provided HTML content and extract the following in JSON format:\n", "1. main_content: The main article or content text (exclude navigation, footers, sidebars, ads)\n", "2. key_points: A list of 3-5 key points or takeaways from the content\n", "3. relevance_score: A score from 0-10 indicating relevance to the search query\n", "\n", "Return ONLY a valid JSON object with these fields, no additional text.\n", "If you cannot parse the HTML properly, return a JSON with error_message field.\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f4b1767-686e-41d7-a038-b3f348e047dd", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "05495e1a316646599a62a9714543e4d2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00 110000:\n", " text = text[:110000] + \"... 
[content truncated]\"\n", " \n", " return text" ] }, { "cell_type": "code", "id": "22998a6f-68bc-4b02-8eaa-8fc3a369b43e", "metadata": {}, "outputs": [], "source": "def parse_html_with_llm(html_path, query, purpose):\n try:\n # Load HTML\n with open(html_path, \"r\", encoding=\"utf-8\") as f:\n html_content = f.read()\n cleaned_text = clean_html_content(html_content)\n \n # Construct prompt\n conversation = [\n {\"role\": \"system\", \"content\": SYS_PROMPT},\n {\"role\": \"user\", \"content\": f\"\"\"\nSearch Query: {query}\nQuery Purpose: {purpose}\n\nHTML Content (cleaned):\n{cleaned_text}\n\nExtract the key information from this content in JSON format according to the instructions.\n\"\"\"}\n ]\n \n output = text_pipeline(\n conversation,\n max_new_tokens=4000, # Reduced from 32000 to a more reasonable size\n temperature=0.01, # cool llm = smart extraction\n do_sample=True,\n )\n \n # Extract the assistant's response\n assistant_response = output[0][\"generated_text\"][-1]\n response_content = assistant_response[\"content\"]\n \n # Print short progress indicator instead of full content\n print(f\"Processing {os.path.basename(html_path)}\")\n \n try:\n json_match = re.search(r'({[\\s\\S]*})', response_content)\n if json_match:\n json_str = json_match.group(1)\n parsed_data = json.loads(json_str)\n else:\n parsed_data = {\"error_message\": \"Failed to extract JSON from LLM response\"}\n except json.JSONDecodeError:\n parsed_data = {\"error_message\": \"Invalid JSON in LLM response\", \"raw_response\": response_content[:500]}\n \n return parsed_data\n \n except Exception as e:\n print(f\"Error processing file: {str(e)}\")\n return {\"error_message\": f\"Error processing file: {str(e)}\"}" }, { "cell_type": "code", "execution_count": 8, "id": "a7537d52-6682-41ed-b5f7-a447c8f60d7b", "metadata": {}, "outputs": [], "source": [ "def process_all_search_results():\n", " with open(base_dir / \"results_so_far.json\", \"r\") as f:\n", " search_results = json.load(f)\n", " \n", " all_parsed_results = []\n", " \n", " for query_data in search_results:\n", " report_index = query_data[\"report_index\"]\n", " report_title = query_data[\"report_title\"]\n", " query_index = query_data[\"query_index\"]\n", " query = query_data[\"query\"]\n", " purpose = query_data[\"purpose\"]\n", "\n", " report_dir_name = f\"report_{report_index}_{report_title.replace(' ', '_').replace(':', '').replace('/', '')[:30]}\"\n", " query_dir_name = f\"query_{query_index}_{query.replace(' ', '_').replace(':', '').replace('/', '')[:30]}\"\n", " parsed_report_dir = parsed_dir / report_dir_name\n", " parsed_report_dir.mkdir(exist_ok=True)\n", " \n", " parsed_query_results = []\n", " \n", " print(f\"\\nProcessing results for query: {query}\")\n", " \n", " for result in query_data[\"results\"]:\n", " result_index = result[\"result_index\"]\n", " title = result[\"title\"]\n", " url = result[\"url\"]\n", " filepath = result[\"filepath\"]\n", " \n", " print(f\" Processing result {result_index + 1}: {title[:50]}...\")\n", " \n", " if filepath and os.path.exists(filepath):\n", " parsed_data = parse_html_with_llm(filepath, query, purpose)\n", " parsed_data.update({\n", " \"result_index\": result_index,\n", " \"title\": title,\n", " \"url\": url,\n", " \"query\": query,\n", " \"purpose\": purpose\n", " })\n", " \n", " result_filename = f\"parsed_result_{result_index}.json\"\n", " with open(parsed_report_dir / result_filename, \"w\") as f:\n", " json.dump(parsed_data, f, indent=2)\n", " \n", " parsed_query_results.append(parsed_data)\n", " 
else:\n", " print(f\" Warning: File not found - {filepath}\")\n", " \n", " query_results = {\n", " \"report_index\": report_index,\n", " \"report_title\": report_title,\n", " \"query_index\": query_index,\n", " \"query\": query,\n", " \"purpose\": purpose,\n", " \"parsed_results\": parsed_query_results\n", " }\n", " \n", " query_filename = f\"parsed_query_{query_index}.json\"\n", " with open(parsed_report_dir / query_filename, \"w\") as f:\n", " json.dump(query_results, f, indent=2)\n", " \n", " all_parsed_results.append(query_results)\n", " \n", " with open(parsed_dir / \"all_parsed_results.json\", \"w\") as f:\n", " json.dump(all_parsed_results, f, indent=2)\n", " \n", " return all_parsed_results" ] }, { "cell_type": "code", "id": "deb15c42-6cdd-4fa7-8823-cce4b17732d7", "metadata": {}, "outputs": [], "source": "def generate_report_summaries(all_parsed_results):\n # Load original outlines and metadata\n try:\n with open('generated_outlines.json', 'r') as f:\n original_outlines = json.load(f)\n except FileNotFoundError:\n print(\"Warning: generated_outlines.json not found. Proceeding without original outlines.\")\n original_outlines = []\n \n # Create a dictionary mapping report_index to original outline data\n original_data_by_index = {}\n if original_outlines:\n for i, outline in enumerate(original_outlines):\n original_data_by_index[i] = {\n \"original_goal\": outline.get(\"original_goal\", {}),\n \"personality\": outline.get(\"personality\", {}),\n \"vibe\": outline.get(\"vibe\", \"\"),\n \"outline_structure\": outline.get(\"outline\", []),\n \"web_queries\": outline.get(\"Web Queries\", [])\n }\n \n report_summaries = {}\n \n # Minimum relevance score to keep content\n MIN_RELEVANCE_SCORE = 6\n \n for query_result in all_parsed_results:\n report_index = query_result[\"report_index\"]\n report_title = query_result[\"report_title\"]\n \n if report_index not in report_summaries:\n report_summaries[report_index] = {\n \"report_title\": report_title,\n \"queries\": [],\n # Add original data if available\n **(original_data_by_index.get(report_index, {}))\n }\n \n # Filter out low-relevance results\n filtered_results = [\n r for r in query_result[\"parsed_results\"] \n if r.get(\"relevance_score\", 0) >= MIN_RELEVANCE_SCORE\n ]\n \n if not filtered_results:\n print(f\"Warning: No high-relevance results for query: {query_result['query']}\")\n # Skip this query if it has no relevant results\n continue\n \n query_summary = {\n \"query\": query_result[\"query\"],\n \"purpose\": query_result[\"purpose\"],\n \"result_count\": len(filtered_results),\n \"average_relevance\": sum(r.get(\"relevance_score\", 0) for r in filtered_results) / \n max(1, len(filtered_results)),\n \"relevant_results\": [\n {\n \"title\": r[\"title\"],\n \"url\": r[\"url\"],\n \"main_content\": r.get(\"main_content\", \"No content available\"),\n \"key_points\": r.get(\"key_points\", []),\n \"relevance_score\": r.get(\"relevance_score\", 0)\n }\n for r in sorted(\n filtered_results, \n key=lambda x: x.get(\"relevance_score\", 0), \n reverse=True\n )\n ]\n }\n \n report_summaries[report_index][\"queries\"].append(query_summary)\n \n for report_index, report_data in report_summaries.items():\n print(f\"\\nGenerating summary for report: {report_data['report_title']}\")\n \n # Construct summary prompt\n queries_info = \"\\n\\n\".join([\n f\"Query: {q['query']}\\nPurpose: {q['purpose']}\\nTop Results:\\n\" + \n \"\\n\".join([f\"- {r['title']}: {' '.join(r['key_points'][:2])}\" for r in q[\"relevant_results\"][:3]])\n for q in 
report_data[\"queries\"]\n ])\n \n summary_prompt = f\"\"\"\nReport Title: {report_data['report_title']}\n\nThe following searches were conducted for this report:\n\n{queries_info}\n\nBased on these search results, generate a brief report outline with:\n1. Key findings across all queries\n2. Important data points uncovered\n3. Suggested sections for the final report\n4. Areas where more research might be needed\n\nReturn this as a JSON with fields: key_findings, data_points, suggested_sections, and research_gaps.\n\"\"\"\n \n conversation = [\n {\"role\": \"system\", \"content\": \"You are a research assistant who helps summarize findings from web searches into structured report outlines.\"},\n {\"role\": \"user\", \"content\": summary_prompt}\n ]\n \n # Generate report summary\n output = text_pipeline(\n conversation,\n max_new_tokens=4000, # Reduced from 32000\n temperature=0.1,\n )\n \n # Extract the assistant's response\n assistant_response = output[0][\"generated_text\"][-1]\n response_content = assistant_response[\"content\"]\n \n # Extract JSON from response\n try:\n json_match = re.search(r'({[\\s\\S]*})', response_content)\n if json_match:\n json_str = json_match.group(1)\n report_summary = json.loads(json_str)\n else:\n report_summary = {\"error\": \"Failed to extract JSON from LLM response\"}\n except json.JSONDecodeError:\n report_summary = {\"error\": \"Invalid JSON in LLM response\"}\n \n report_data[\"generated_summary\"] = report_summary\n \n # Save enriched report summaries with all original context\n enriched_reports_path = parsed_dir / \"enriched_reports.json\"\n with open(enriched_reports_path, \"w\") as f:\n json.dump(report_summaries, f, indent=2)\n \n # Also save a simplified version for backward compatibility\n with open(parsed_dir / \"report_summaries.json\", \"w\") as f:\n json.dump(report_summaries, f, indent=2)\n \n print(f\"\\nEnriched reports saved to: {enriched_reports_path}\")\n return report_summaries" }, { "cell_type": "code", "id": "0c00ef8b-bd83-4b31-96ba-86832a13774c", "metadata": {}, "outputs": [], "source": "print(\"Starting HTML parsing process with LLM...\")\nall_parsed_results = process_all_search_results()\n\nprint(\"\\nGenerating report summaries...\")\nreport_summaries = generate_report_summaries(all_parsed_results)\n\nprint(\"\\nProcessing complete. 
Results saved to:\")\nprint(f\"- All parsed results: {parsed_dir / 'all_parsed_results.json'}\")\nprint(f\"- Enriched reports with original context: {parsed_dir / 'enriched_reports.json'}\")\n\ntotal_queries = len(all_parsed_results)\ntotal_results = sum(len(query[\"parsed_results\"]) for query in all_parsed_results)\ntotal_reports = len(report_summaries)\n\nprint(f\"\\nSummary Statistics:\")\nprint(f\"- Total Reports: {total_reports}\")\nprint(f\"- Total Queries: {total_queries}\")\nprint(f\"- Total Results Parsed: {total_results}\")\n\n# Calculate statistics only on high-relevance results\nMIN_RELEVANCE_SCORE = 6\nhigh_relevance_results = [\n result\n for query in all_parsed_results \n for result in query[\"parsed_results\"]\n if result.get(\"relevance_score\", 0) >= MIN_RELEVANCE_SCORE\n]\n\ntotal_high_relevance = len(high_relevance_results)\nprint(f\"- High Relevance Results (score >= {MIN_RELEVANCE_SCORE}): {total_high_relevance}\")\n\nif high_relevance_results:\n avg_relevance = sum(result.get(\"relevance_score\", 0) for result in high_relevance_results) / total_high_relevance\n print(f\"- Average Relevance Score (high relevance only): {avg_relevance:.2f}/10\")\n\n# Display a sample of one report's structure (to verify format)\nif report_summaries:\n sample_report_index = list(report_summaries.keys())[0]\n sample_report = report_summaries[sample_report_index]\n print(f\"\\nSample Structure for Report '{sample_report['report_title']}':\")\n print(\"- Original metadata included\")\n print(\"- Queries with filtered high-relevance results\")\n print(\"- Generated summary included\")\n print(\"Ready for the next step in the workflow!\")" }, { "cell_type": "code", "execution_count": null, "id": "3fca3762-2400-4729-b1e5-4e8c069ecf01", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }