{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#!pip install google-search-results" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import time\n", "from serpapi import GoogleSearch\n", "import requests\n", "import hashlib\n", "from pathlib import Path\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "base_dir = Path(\"llama_data\")\n", "src_dir = base_dir / \"src\"\n", "results_dir = base_dir / \"results\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "base_dir.mkdir(exist_ok=True)\n", "src_dir.mkdir(exist_ok=True)\n", "results_dir.mkdir(exist_ok=True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open('generated_outlines.json', 'r') as file:\n", " content = file.read()\n", " data = json.loads(content)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 5 report outlines\n" ] } ], "source": [ "print(f\"Loaded {len(data)} report outlines\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Sample report title: Llama 3.3: A Revolutionary Leap in AI\n", "Sample queries:\n", "- Llama 3.3 new features and enhancements: To gather information on the new features and enhancements in Llama 3.3\n", "- Llama 3.3 vs Llama 3.1 performance comparison: To gather information on the performance comparison between Llama 3.3 and Llama 3.1\n" ] } ], "source": [ "print(\"\\nSample report title:\", data[0].get('original_goal', {}).get('Report Title', 'No title'))\n", "print(\"Sample queries:\")\n", "for query in data[0].get('Web Queries', [])[:2]:\n", " print(f\"- {query.get('query')}: {query.get('purpose')}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_queries = []" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "for report_index, report_data in enumerate(data):\n", " report_title = report_data.get('original_goal', {}).get('Report Title', f\"Report {report_index}\")\n", " \n", " for query_index, query_data in enumerate(report_data.get('Web Queries', [])):\n", " query = query_data.get('query', '')\n", " purpose = query_data.get('purpose', '')\n", " \n", " all_queries.append({\n", " 'report_index': report_index,\n", " 'report_title': report_title,\n", " 'query_index': query_index,\n", " 'query': query,\n", " 'purpose': purpose\n", " })" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total queries extracted: 15\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
report_indexreport_titlequery_indexquerypurpose
00Llama 3.3: A Revolutionary Leap in AI0Llama 3.3 new features and enhancementsTo gather information on the new features and ...
10Llama 3.3: A Revolutionary Leap in AI1Llama 3.3 vs Llama 3.1 performance comparisonTo gather information on the performance compa...
20Llama 3.3: A Revolutionary Leap in AI2Cost of running Llama 3.3 on cloud vs local in...To gather information on the cost-effectivenes...
31Llama 3.3 vs Llama 3.1: A Comparative Analysis0Llama 3.3 new features and improvementsTo gather information on new features and impr...
41Llama 3.3 vs Llama 3.1: A Comparative Analysis1Llama 3.1 vs Llama 3.3 performance comparisonTo gather information on performance differenc...
\n", "
" ], "text/plain": [ " report_index report_title query_index \\\n", "0 0 Llama 3.3: A Revolutionary Leap in AI 0 \n", "1 0 Llama 3.3: A Revolutionary Leap in AI 1 \n", "2 0 Llama 3.3: A Revolutionary Leap in AI 2 \n", "3 1 Llama 3.3 vs Llama 3.1: A Comparative Analysis 0 \n", "4 1 Llama 3.3 vs Llama 3.1: A Comparative Analysis 1 \n", "\n", " query \\\n", "0 Llama 3.3 new features and enhancements \n", "1 Llama 3.3 vs Llama 3.1 performance comparison \n", "2 Cost of running Llama 3.3 on cloud vs local in... \n", "3 Llama 3.3 new features and improvements \n", "4 Llama 3.1 vs Llama 3.3 performance comparison \n", "\n", " purpose \n", "0 To gather information on the new features and ... \n", "1 To gather information on the performance compa... \n", "2 To gather information on the cost-effectivenes... \n", "3 To gather information on new features and impr... \n", "4 To gather information on performance differenc... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "queries_df = pd.DataFrame(all_queries)\n", "print(f\"Total queries extracted: {len(queries_df)}\")\n", "queries_df.head()\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SERPAPI_KEY = \"\"\n", "SERPAPI_KEY" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def search_with_serpapi(query, num_results=5):\n", " print(f\"Searching for: {query}\")\n", " \n", " params = {\n", " \"engine\": \"google\",\n", " \"q\": query,\n", " \"api_key\": SERPAPI_KEY,\n", " \"num\": num_results,\n", " }\n", " \n", " search = GoogleSearch(params)\n", " results = search.get_dict()\n", " \n", " # Check if we have organic results\n", " if \"organic_results\" not in results:\n", " print(f\"Warning: No organic results found for query: {query}\")\n", " return []\n", " \n", " return results[\"organic_results\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fetch_html(url):\n", " try:\n", " headers = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\"\n", " }\n", " response = requests.get(url, headers=headers, timeout=10)\n", " response.raise_for_status()\n", " return response.text\n", " except Exception as e:\n", " print(f\"Error fetching HTML from {url}: {str(e)}\")\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def save_html(html_content, report_index, report_title, query_index, query, result_index, title, url):\n", " if html_content is None:\n", " return None\n", " \n", " sanitized_report = report_title.replace(\" \", \"_\").replace(\":\", \"\").replace(\"/\", \"\")[:30]\n", " sanitized_query = query.replace(\" \", \"_\").replace(\":\", \"\").replace(\"/\", \"\")[:30]\n", " \n", " url_hash = hashlib.md5(url.encode()).hexdigest()[:8]\n", "\n", " report_dir = results_dir / f\"report_{report_index}_{sanitized_report}\"\n", " report_dir.mkdir(exist_ok=True)\n", " \n", " query_dir = report_dir / f\"query_{query_index}_{sanitized_query}\"\n", " query_dir.mkdir(exist_ok=True)\n", " \n", " sanitized_title = ''.join(c if c.isalnum() or c in ['_', '-'] else '_' for c in title)[:30]\n", " filename = f\"result_{result_index}_{url_hash}_{sanitized_title}.html\"\n", " filepath = query_dir / filename\n", "\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(html_content)\n", " \n", " metadata = {\n", " \"report_index\": report_index,\n", " \"report_title\": report_title,\n", " \"query_index\": query_index,\n", " \"query\": query,\n", " \"result_index\": result_index,\n", " \"title\": title,\n", " \"url\": url,\n", " \"timestamp\": time.strftime(\"%Y-%m-%d %H:%M:%S\")\n", " }\n", " \n", " metadata_path = query_dir / f\"result_{result_index}_{url_hash}_metadata.json\"\n", " with open(metadata_path, \"w\") as f:\n", " json.dump(metadata, f, indent=2)\n", " \n", " return str(filepath)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_all_queries(queries_df):\n", " results = []\n", " \n", " for index, row in queries_df.iterrows():\n", " print(f\"\\nProcessing query {index + 1}/{len(queries_df)}\")\n", " print(f\"Report: {row['report_title']}\")\n", " print(f\"Query: {row['query']}\")\n", " \n", " search_results = search_with_serpapi(row['query'])\n", " \n", " query_results = []\n", " for result_index, result in enumerate(search_results):\n", " title = result.get('title', 'No Title')\n", " url = result.get('link', '')\n", " snippet = result.get('snippet', '')\n", " \n", " print(f\" Result {result_index + 1}: {title[:50]}...\")\n", " \n", " html_content = fetch_html(url)\n", " filepath = save_html(\n", " html_content, \n", " row['report_index'], \n", " row['report_title'],\n", " row['query_index'], \n", " row['query'], \n", " result_index, \n", " title, \n", " url\n", " )\n", " \n", " result_info = {\n", " \"result_index\": result_index,\n", " \"title\": title,\n", " \"url\": url,\n", " \"snippet\": snippet,\n", " \"filepath\": filepath\n", " }\n", " \n", " query_results.append(result_info)\n", " \n", " # Timeout\n", " time.sleep(1)\n", " \n", " query_result = {\n", " \"report_index\": row['report_index'],\n", " \"report_title\": row['report_title'],\n", " \"query_index\": row['query_index'],\n", " \"query\": row['query'],\n", " \"purpose\": row['purpose'],\n", " \"results\": query_results\n", " }\n", " \n", " results.append(query_result)\n", " \n", " with open(base_dir / \"results_so_far.json\", \"w\") as f:\n", " json.dump(results, f, indent=2)\n", " \n", " return results\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Processing query 1/15\n", "Report: Llama 3.3: A Revolutionary Leap in AI\n", "Query: Llama 3.3 new features and enhancements\n", "Searching for: Llama 3.3 new features and enhancements\n", " Result 1: Introducing the new Llama 3.3: Features and Overvi...\n", " Result 2: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n", " Result 3: Key Features and Improvements in LLaMA 3.3...\n", " Result 4: Everything You Need to Know About Llama 3.3 | by A...\n", "\n", "Processing query 2/15\n", "Report: Llama 3.3: A Revolutionary Leap in AI\n", "Query: Llama 3.3 vs Llama 3.1 performance comparison\n", "Searching for: Llama 3.3 vs Llama 3.1 performance comparison\n", " Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n", " Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n", " Result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...\n", " Result 4: Llama 3.3 just dropped — is it better than GPT-4 o...\n", " Result 5: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n", "\n", "Processing query 3/15\n", "Report: Llama 3.3: A Revolutionary Leap in AI\n", "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n", "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n", " Result 1: What's the cost of running Llama3:8b & 70b in the ...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n", " Result 4: Llama 3.3 API Pricing: What You Need to Know...\n", " Result 5: Llama models | Generative AI...\n", "\n", "Processing query 4/15\n", "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n", "Query: Llama 3.3 new features and improvements\n", "Searching for: Llama 3.3 new features and improvements\n", " Result 1: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Efficient, Accessible Generative AI on CPU with Ne...\n", " Result 4: Meta Releases Llama 3.3: a Model with Enhanced Per...\n", "\n", "Processing query 5/15\n", "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n", "Query: Llama 3.1 vs Llama 3.3 performance comparison\n", "Searching for: Llama 3.1 vs Llama 3.3 performance comparison\n", " Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n", " Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n", " Result 3: Llama 3.3 just dropped — is it better than GPT-4 o...\n", " Result 4: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n", "\n", "Processing query 6/15\n", "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n", "Query: Cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure\n", "Searching for: Cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure\n", " Result 1: What's the cost of running Llama3:8b & 70b in the ...\n", " Result 2: The Million-Dollar Trick: LLAMA 3.1 is Free to Own...\n", " Result 3: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n", " Result 4: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 5: Llama models | Generative AI...\n", "\n", "Processing query 7/15\n", "Report: The Cost-Benefit Analysis of Llama 3.3\n", "Query: Llama 3.3 new features and improvements\n", "Searching for: Llama 3.3 new features and improvements\n", " Result 1: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Efficient, Accessible Generative AI on CPU with Ne...\n", " Result 4: Meta Releases Llama 3.3: a Model with Enhanced Per...\n", "\n", "Processing query 8/15\n", "Report: The Cost-Benefit Analysis of Llama 3.3\n", "Query: Cost of running Llama 3.3 on cloud vs local\n", "Searching for: Cost of running Llama 3.3 on cloud vs local\n", " Result 1: Costs to run Llama 3.3 on cloud? : r/LocalLLaMA...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Llama models | Generative AI...\n", " Result 4: Meta Llama in the Cloud | Llama Everywhere...\n", "Error fetching HTML from https://www.llama.com/docs/llama-everywhere/running-meta-llama-in-the-cloud/: 400 Client Error: Bad Request for url: https://www.llama.com/docs/llama-everywhere/running-meta-llama-in-the-cloud/\n", " Result 5: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n", "\n", "Processing query 9/15\n", "Report: The Cost-Benefit Analysis of Llama 3.3\n", "Query: Llama 3.3 vs Llama 3.1 performance comparison\n", "Searching for: Llama 3.3 vs Llama 3.1 performance comparison\n", " Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n", " Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n", " Result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...\n", " Result 4: Llama 3.3 just dropped — is it better than GPT-4 o...\n", " Result 5: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n", "\n", "Processing query 10/15\n", "Report: Llama 3.3: The Future of AI-Driven Innovation\n", "Query: Llama 3.3 new features and enhancements\n", "Searching for: Llama 3.3 new features and enhancements\n", " Result 1: Introducing the new Llama 3.3: Features and Overvi...\n", " Result 2: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n", " Result 3: Key Features and Improvements in LLaMA 3.3...\n", " Result 4: Everything You Need to Know About Llama 3.3 | by A...\n", "\n", "Processing query 11/15\n", "Report: Llama 3.3: The Future of AI-Driven Innovation\n", "Query: Llama 3.3 vs Llama 3.1 comparison\n", "Searching for: Llama 3.3 vs Llama 3.1 comparison\n", " Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n", " Result 2: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n", " Result 3: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n", " Result 4: Llama 3.1 vs Llama 3 Differences - GoPenAI...\n", " Result 5: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n", "\n", "Processing query 12/15\n", "Report: Llama 3.3: The Future of AI-Driven Innovation\n", "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n", "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n", " Result 1: What's the cost of running Llama3:8b & 70b in the ...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n", " Result 4: Llama 3.3 API Pricing: What You Need to Know...\n", " Result 5: Llama models | Generative AI...\n", "\n", "Processing query 13/15\n", "Report: Llama 3.3: A Technical Deep Dive\n", "Query: Llama 3.3 architecture and technical specifications\n", "Searching for: Llama 3.3 architecture and technical specifications\n", " Result 1: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 2: Introducing Meta Llama 3: The most capable openly ...\n", "Error fetching HTML from https://ai.meta.com/blog/meta-llama-3/: 400 Client Error: Bad Request for url: https://ai.meta.com/blog/meta-llama-3/\n", " Result 3: meta-llama/Llama-3.3-70B-Instruct...\n", " Result 4: llama-3.3-70b-instruct Model by Meta...\n", " Result 5: Llama-3.3-70B - Documentation & FAQ...\n", "\n", "Processing query 14/15\n", "Report: Llama 3.3: A Technical Deep Dive\n", "Query: Llama 3.3 vs Llama 3.1 comparison\n", "Searching for: Llama 3.3 vs Llama 3.1 comparison\n", " Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n", " Result 2: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n", " Result 3: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n", " Result 4: Llama 3.1 vs Llama 3 Differences - GoPenAI...\n", " Result 5: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n", "\n", "Processing query 15/15\n", "Report: Llama 3.3: A Technical Deep Dive\n", "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n", "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n", " Result 1: What's the cost of running Llama3:8b & 70b in the ...\n", " Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n", "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n", " Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n", " Result 4: Llama 3.3 API Pricing: What You Need to Know...\n", " Result 5: Llama models | Generative AI...\n" ] } ], "source": [ "results = process_all_queries(queries_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def analyze_results():\n", "\n", " try:\n", " with open(base_dir / \"results_so_far.json\", \"r\") as f:\n", " results = json.load(f)\n", " \n", " total_results = sum(len(query[\"results\"]) for query in results)\n", " print(f\"Total queries processed: {len(results)}\")\n", " print(f\"Total search results fetched: {total_results}\")\n", " \n", " summary_data = []\n", " for query in results:\n", " report_title = query[\"report_title\"]\n", " query_text = query[\"query\"]\n", " results_count = len(query[\"results\"])\n", " \n", " summary_data.append({\n", " \"Report\": report_title,\n", " \"Query\": query_text,\n", " \"Results Count\": results_count\n", " })\n", " \n", " summary_df = pd.DataFrame(summary_data)\n", " return summary_df\n", " except FileNotFoundError:\n", " print(\"No results file found. Run the processing first.\")\n", " return None\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total queries processed: 15\n", "Total search results fetched: 70\n" ] } ], "source": [ "summary_df = analyze_results()\n", "# if summary_df is not None:\n", "# summary_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }