Explorar o código

Create Step-3-Download-Info.ipynb

Sanyam Bhutani hai 3 meses
pai
achega
8a174783ee
Modificáronse 1 ficheiros con 686 adicións e 0 borrados
  1. 686 0
      end-to-end-use-cases/researcher/Step-3-Download-Info.ipynb

+ 686 - 0
end-to-end-use-cases/researcher/Step-3-Download-Info.ipynb

@@ -0,0 +1,686 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install google-search-results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "import time\n",
+    "from serpapi import GoogleSearch\n",
+    "import requests\n",
+    "import hashlib\n",
+    "from pathlib import Path\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_dir = Path(\"llama_data\")\n",
+    "src_dir = base_dir / \"src\"\n",
+    "results_dir = base_dir / \"results\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the output folders in parent-first order so each mkdir finds its parent.\n",
+    "for directory in (base_dir, src_dir, results_dir):\n",
+    "    directory.mkdir(exist_ok=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the report outlines; the encoding is pinned so non-ASCII text decodes\n",
+    "# the same way on every platform instead of depending on the locale default.\n",
+    "with open('generated_outlines.json', 'r', encoding='utf-8') as file:\n",
+    "    data = json.load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 5 report outlines\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Loaded {len(data)} report outlines\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Sample report title: Llama 3.3: A Revolutionary Leap in AI\n",
+      "Sample queries:\n",
+      "- Llama 3.3 new features and enhancements: To gather information on the new features and enhancements in Llama 3.3\n",
+      "- Llama 3.3 vs Llama 3.1 performance comparison: To gather information on the performance comparison between Llama 3.3 and Llama 3.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Peek at the first outline: its title and the first two web queries.\n",
+    "first_report = data[0]\n",
+    "sample_title = first_report.get('original_goal', {}).get('Report Title', 'No title')\n",
+    "print(\"\\nSample report title:\", sample_title)\n",
+    "print(\"Sample queries:\")\n",
+    "for sample in first_report.get('Web Queries', [])[:2]:\n",
+    "    print(f\"- {sample.get('query')}: {sample.get('purpose')}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_queries = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rebuild the list from scratch so re-running this cell never appends duplicates.\n",
+    "all_queries = []\n",
+    "\n",
+    "for report_index, report_data in enumerate(data):\n",
+    "    # Fall back to a positional label when the outline carries no title.\n",
+    "    report_title = report_data.get('original_goal', {}).get('Report Title', f\"Report {report_index}\")\n",
+    "\n",
+    "    for query_index, query_data in enumerate(report_data.get('Web Queries', [])):\n",
+    "        all_queries.append({\n",
+    "            'report_index': report_index,\n",
+    "            'report_title': report_title,\n",
+    "            'query_index': query_index,\n",
+    "            'query': query_data.get('query', ''),\n",
+    "            'purpose': query_data.get('purpose', '')\n",
+    "        })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total queries extracted: 15\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>report_index</th>\n",
+       "      <th>report_title</th>\n",
+       "      <th>query_index</th>\n",
+       "      <th>query</th>\n",
+       "      <th>purpose</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Llama 3.3: A Revolutionary Leap in AI</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Llama 3.3 new features and enhancements</td>\n",
+       "      <td>To gather information on the new features and ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Llama 3.3: A Revolutionary Leap in AI</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Llama 3.3 vs Llama 3.1 performance comparison</td>\n",
+       "      <td>To gather information on the performance compa...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Llama 3.3: A Revolutionary Leap in AI</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Cost of running Llama 3.3 on cloud vs local in...</td>\n",
+       "      <td>To gather information on the cost-effectivenes...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Llama 3.3 vs Llama 3.1: A Comparative Analysis</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Llama 3.3 new features and improvements</td>\n",
+       "      <td>To gather information on new features and impr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Llama 3.3 vs Llama 3.1: A Comparative Analysis</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Llama 3.1 vs Llama 3.3 performance comparison</td>\n",
+       "      <td>To gather information on performance differenc...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   report_index                                    report_title  query_index  \\\n",
+       "0             0           Llama 3.3: A Revolutionary Leap in AI            0   \n",
+       "1             0           Llama 3.3: A Revolutionary Leap in AI            1   \n",
+       "2             0           Llama 3.3: A Revolutionary Leap in AI            2   \n",
+       "3             1  Llama 3.3 vs Llama 3.1: A Comparative Analysis            0   \n",
+       "4             1  Llama 3.3 vs Llama 3.1: A Comparative Analysis            1   \n",
+       "\n",
+       "                                               query  \\\n",
+       "0            Llama 3.3 new features and enhancements   \n",
+       "1      Llama 3.3 vs Llama 3.1 performance comparison   \n",
+       "2  Cost of running Llama 3.3 on cloud vs local in...   \n",
+       "3            Llama 3.3 new features and improvements   \n",
+       "4      Llama 3.1 vs Llama 3.3 performance comparison   \n",
+       "\n",
+       "                                             purpose  \n",
+       "0  To gather information on the new features and ...  \n",
+       "1  To gather information on the performance compa...  \n",
+       "2  To gather information on the cost-effectivenes...  \n",
+       "3  To gather information on new features and impr...  \n",
+       "4  To gather information on performance differenc...  "
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "queries_df = pd.DataFrame(all_queries)\n",
+    "print(f\"Total queries extracted: {len(queries_df)}\")\n",
+    "queries_df.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "''"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "SERPAPI_KEY = \"\"\n",
+    "SERPAPI_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def search_with_serpapi(query, num_results=5):\n",
+    "    \"\"\"Run a Google search through SerpApi and return its organic results.\n",
+    "\n",
+    "    Args:\n",
+    "        query: Search string sent as the \"q\" parameter.\n",
+    "        num_results: Value sent as the \"num\" parameter.\n",
+    "\n",
+    "    Returns:\n",
+    "        The \"organic_results\" list from the response, or an empty list when\n",
+    "        the response does not contain one.\n",
+    "    \"\"\"\n",
+    "    print(f\"Searching for: {query}\")\n",
+    "\n",
+    "    params = {\n",
+    "        \"engine\": \"google\",\n",
+    "        \"q\": query,\n",
+    "        \"api_key\": SERPAPI_KEY,\n",
+    "        \"num\": num_results,\n",
+    "    }\n",
+    "\n",
+    "    results = GoogleSearch(params).get_dict()\n",
+    "\n",
+    "    # A failed request (bad key, exhausted quota, ...) carries an \"error\" message;\n",
+    "    # print it so the failure is not mistaken for an empty result page.\n",
+    "    error_message = results.get(\"error\")\n",
+    "    if error_message:\n",
+    "        print(f\"Warning: SerpApi returned an error for query: {query}: {error_message}\")\n",
+    "\n",
+    "    if \"organic_results\" not in results:\n",
+    "        print(f\"Warning: No organic results found for query: {query}\")\n",
+    "        return []\n",
+    "\n",
+    "    return results[\"organic_results\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fetch_html(url):\n",
+    "    \"\"\"Download a page and return its HTML text.\n",
+    "\n",
+    "    Args:\n",
+    "        url: Address of the page to download.\n",
+    "\n",
+    "    Returns:\n",
+    "        The decoded response body, or None when the URL is empty, the request\n",
+    "        fails, or the server answers with an error status.\n",
+    "    \"\"\"\n",
+    "    if not url:\n",
+    "        print(\"Error fetching HTML: empty URL\")\n",
+    "        return None\n",
+    "\n",
+    "    try:\n",
+    "        headers = {\n",
+    "            \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\"\n",
+    "        }\n",
+    "        response = requests.get(url, headers=headers, timeout=10)\n",
+    "        response.raise_for_status()\n",
+    "\n",
+    "        # When Content-Type names no charset, requests falls back to ISO-8859-1 for\n",
+    "        # text responses, which garbles UTF-8 pages; use the detected encoding instead.\n",
+    "        if \"charset\" not in response.headers.get(\"Content-Type\", \"\").lower():\n",
+    "            response.encoding = response.apparent_encoding\n",
+    "\n",
+    "        return response.text\n",
+    "    except Exception as e:\n",
+    "        # Best effort: one unreachable page must not abort the whole crawl.\n",
+    "        print(f\"Error fetching HTML from {url}: {str(e)}\")\n",
+    "        return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_html(html_content, report_index, report_title, query_index, query, result_index, title, url):\n",
+    "    \"\"\"Write a fetched page and a JSON metadata sidecar under results_dir.\n",
+    "\n",
+    "    Files land in results_dir/report_<index>_<title>/query_<index>_<query>/.\n",
+    "\n",
+    "    Args:\n",
+    "        html_content: Page text to store, or None when the download failed.\n",
+    "        report_index, report_title: Identify the report folder.\n",
+    "        query_index, query: Identify the query folder inside the report folder.\n",
+    "        result_index, title, url: Identify the search hit; the first 8 hex digits\n",
+    "            of the URL's MD5 keep file names distinct.\n",
+    "\n",
+    "    Returns:\n",
+    "        The path of the saved HTML file as a string, or None when\n",
+    "        html_content is None (nothing is written in that case).\n",
+    "    \"\"\"\n",
+    "    if html_content is None:\n",
+    "        return None\n",
+    "\n",
+    "    # Characters that are not allowed in Windows file names (chr(92) is the backslash).\n",
+    "    unsafe_chars = set('<>|?*\"') | {chr(92)}\n",
+    "\n",
+    "    def _folder_name(text):\n",
+    "        # Spaces become \"_\", \":\" and \"/\" are dropped, any other unsafe or control\n",
+    "        # character becomes \"_\", and the result is capped at 30 characters.\n",
+    "        cleaned = text.replace(\" \", \"_\").replace(\":\", \"\").replace(\"/\", \"\")\n",
+    "        cleaned = ''.join('_' if c in unsafe_chars or ord(c) < 32 else c for c in cleaned)\n",
+    "        return cleaned[:30]\n",
+    "\n",
+    "    sanitized_report = _folder_name(report_title)\n",
+    "    sanitized_query = _folder_name(query)\n",
+    "\n",
+    "    url_hash = hashlib.md5(url.encode()).hexdigest()[:8]\n",
+    "\n",
+    "    report_dir = results_dir / f\"report_{report_index}_{sanitized_report}\"\n",
+    "    query_dir = report_dir / f\"query_{query_index}_{sanitized_query}\"\n",
+    "    # parents=True creates the report folder (and results_dir, if missing) in one call.\n",
+    "    query_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "    sanitized_title = ''.join(c if c.isalnum() or c in ['_', '-'] else '_' for c in title)[:30]\n",
+    "    filename = f\"result_{result_index}_{url_hash}_{sanitized_title}.html\"\n",
+    "    filepath = query_dir / filename\n",
+    "\n",
+    "    with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
+    "        f.write(html_content)\n",
+    "\n",
+    "    metadata = {\n",
+    "        \"report_index\": report_index,\n",
+    "        \"report_title\": report_title,\n",
+    "        \"query_index\": query_index,\n",
+    "        \"query\": query,\n",
+    "        \"result_index\": result_index,\n",
+    "        \"title\": title,\n",
+    "        \"url\": url,\n",
+    "        \"timestamp\": time.strftime(\"%Y-%m-%d %H:%M:%S\")\n",
+    "    }\n",
+    "\n",
+    "    metadata_path = query_dir / f\"result_{result_index}_{url_hash}_metadata.json\"\n",
+    "    with open(metadata_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        json.dump(metadata, f, indent=2)\n",
+    "\n",
+    "    return str(filepath)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_all_queries(queries_df, delay_seconds=1):\n",
+    "    \"\"\"Search every query, download each hit, and checkpoint progress to disk.\n",
+    "\n",
+    "    After each query the accumulated results are rewritten to\n",
+    "    base_dir / \"results_so_far.json\", so an interrupted run keeps its progress.\n",
+    "\n",
+    "    Args:\n",
+    "        queries_df: DataFrame with report_index, report_title, query_index,\n",
+    "            query and purpose columns.\n",
+    "        delay_seconds: Pause, in seconds, after each page download.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with one dict per query holding the query fields and a \"results\"\n",
+    "        list of per-hit dicts (result_index, title, url, snippet, filepath).\n",
+    "        filepath is None for hits whose page could not be downloaded.\n",
+    "    \"\"\"\n",
+    "    results = []\n",
+    "    # Different queries often return the same URL; download each one once per run\n",
+    "    # (failed downloads are remembered too, so a blocked site is not retried).\n",
+    "    html_cache = {}\n",
+    "    # Records are plain dicts, so progress numbering does not depend on the\n",
+    "    # DataFrame index being 0..n-1.\n",
+    "    rows = queries_df.to_dict(\"records\")\n",
+    "\n",
+    "    for position, row in enumerate(rows, start=1):\n",
+    "        print(f\"\\nProcessing query {position}/{len(rows)}\")\n",
+    "        print(f\"Report: {row['report_title']}\")\n",
+    "        print(f\"Query: {row['query']}\")\n",
+    "\n",
+    "        search_results = search_with_serpapi(row['query'])\n",
+    "\n",
+    "        query_results = []\n",
+    "        for result_index, result in enumerate(search_results):\n",
+    "            title = result.get('title', 'No Title')\n",
+    "            url = result.get('link', '')\n",
+    "            snippet = result.get('snippet', '')\n",
+    "\n",
+    "            print(f\"  Result {result_index + 1}: {title[:50]}...\")\n",
+    "\n",
+    "            if url not in html_cache:\n",
+    "                html_cache[url] = fetch_html(url)\n",
+    "                # Pause between downloads so target sites are not hammered.\n",
+    "                time.sleep(delay_seconds)\n",
+    "            html_content = html_cache[url]\n",
+    "\n",
+    "            filepath = save_html(\n",
+    "                html_content,\n",
+    "                row['report_index'],\n",
+    "                row['report_title'],\n",
+    "                row['query_index'],\n",
+    "                row['query'],\n",
+    "                result_index,\n",
+    "                title,\n",
+    "                url\n",
+    "            )\n",
+    "\n",
+    "            query_results.append({\n",
+    "                \"result_index\": result_index,\n",
+    "                \"title\": title,\n",
+    "                \"url\": url,\n",
+    "                \"snippet\": snippet,\n",
+    "                \"filepath\": filepath\n",
+    "            })\n",
+    "\n",
+    "        results.append({\n",
+    "            \"report_index\": row['report_index'],\n",
+    "            \"report_title\": row['report_title'],\n",
+    "            \"query_index\": row['query_index'],\n",
+    "            \"query\": row['query'],\n",
+    "            \"purpose\": row['purpose'],\n",
+    "            \"results\": query_results\n",
+    "        })\n",
+    "\n",
+    "        # Checkpoint after every query.\n",
+    "        with open(base_dir / \"results_so_far.json\", \"w\", encoding=\"utf-8\") as f:\n",
+    "            json.dump(results, f, indent=2)\n",
+    "\n",
+    "    return results\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Processing query 1/15\n",
+      "Report: Llama 3.3: A Revolutionary Leap in AI\n",
+      "Query: Llama 3.3 new features and enhancements\n",
+      "Searching for: Llama 3.3 new features and enhancements\n",
+      "  Result 1: Introducing the new Llama 3.3: Features and Overvi...\n",
+      "  Result 2: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n",
+      "  Result 3: Key Features and Improvements in LLaMA 3.3...\n",
+      "  Result 4: Everything You Need to Know About Llama 3.3 | by A...\n",
+      "\n",
+      "Processing query 2/15\n",
+      "Report: Llama 3.3: A Revolutionary Leap in AI\n",
+      "Query: Llama 3.3 vs Llama 3.1 performance comparison\n",
+      "Searching for: Llama 3.3 vs Llama 3.1 performance comparison\n",
+      "  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n",
+      "  Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n",
+      "  Result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...\n",
+      "  Result 4: Llama 3.3 just dropped — is it better than GPT-4 o...\n",
+      "  Result 5: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n",
+      "\n",
+      "Processing query 3/15\n",
+      "Report: Llama 3.3: A Revolutionary Leap in AI\n",
+      "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "  Result 1: What's the cost of running Llama3:8b & 70b in the ...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n",
+      "  Result 4: Llama 3.3 API Pricing: What You Need to Know...\n",
+      "  Result 5: Llama models | Generative AI...\n",
+      "\n",
+      "Processing query 4/15\n",
+      "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n",
+      "Query: Llama 3.3 new features and improvements\n",
+      "Searching for: Llama 3.3 new features and improvements\n",
+      "  Result 1: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Efficient, Accessible Generative AI on CPU with Ne...\n",
+      "  Result 4: Meta Releases Llama 3.3: a Model with Enhanced Per...\n",
+      "\n",
+      "Processing query 5/15\n",
+      "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n",
+      "Query: Llama 3.1 vs Llama 3.3 performance comparison\n",
+      "Searching for: Llama 3.1 vs Llama 3.3 performance comparison\n",
+      "  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n",
+      "  Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n",
+      "  Result 3: Llama 3.3 just dropped — is it better than GPT-4 o...\n",
+      "  Result 4: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n",
+      "\n",
+      "Processing query 6/15\n",
+      "Report: Llama 3.3 vs Llama 3.1: A Comparative Analysis\n",
+      "Query: Cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure\n",
+      "Searching for: Cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure\n",
+      "  Result 1: What's the cost of running Llama3:8b & 70b in the ...\n",
+      "  Result 2: The Million-Dollar Trick: LLAMA 3.1 is Free to Own...\n",
+      "  Result 3: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n",
+      "  Result 4: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 5: Llama models | Generative AI...\n",
+      "\n",
+      "Processing query 7/15\n",
+      "Report: The Cost-Benefit Analysis of Llama 3.3\n",
+      "Query: Llama 3.3 new features and improvements\n",
+      "Searching for: Llama 3.3 new features and improvements\n",
+      "  Result 1: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Efficient, Accessible Generative AI on CPU with Ne...\n",
+      "  Result 4: Meta Releases Llama 3.3: a Model with Enhanced Per...\n",
+      "\n",
+      "Processing query 8/15\n",
+      "Report: The Cost-Benefit Analysis of Llama 3.3\n",
+      "Query: Cost of running Llama 3.3 on cloud vs local\n",
+      "Searching for: Cost of running Llama 3.3 on cloud vs local\n",
+      "  Result 1: Costs to run Llama 3.3 on cloud? : r/LocalLLaMA...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Llama models | Generative AI...\n",
+      "  Result 4: Meta Llama in the Cloud | Llama Everywhere...\n",
+      "Error fetching HTML from https://www.llama.com/docs/llama-everywhere/running-meta-llama-in-the-cloud/: 400 Client Error: Bad Request for url: https://www.llama.com/docs/llama-everywhere/running-meta-llama-in-the-cloud/\n",
+      "  Result 5: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n",
+      "\n",
+      "Processing query 9/15\n",
+      "Report: The Cost-Benefit Analysis of Llama 3.3\n",
+      "Query: Llama 3.3 vs Llama 3.1 performance comparison\n",
+      "Searching for: Llama 3.3 vs Llama 3.1 performance comparison\n",
+      "  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n",
+      "  Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n",
+      "  Result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...\n",
+      "  Result 4: Llama 3.3 just dropped — is it better than GPT-4 o...\n",
+      "  Result 5: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n",
+      "\n",
+      "Processing query 10/15\n",
+      "Report: Llama 3.3: The Future of AI-Driven Innovation\n",
+      "Query: Llama 3.3 new features and enhancements\n",
+      "Searching for: Llama 3.3 new features and enhancements\n",
+      "  Result 1: Introducing the new Llama 3.3: Features and Overvi...\n",
+      "  Result 2: What is Meta Llama 3.3 70B? Features, Use Cases & ...\n",
+      "  Result 3: Key Features and Improvements in LLaMA 3.3...\n",
+      "  Result 4: Everything You Need to Know About Llama 3.3 | by A...\n",
+      "\n",
+      "Processing query 11/15\n",
+      "Report: Llama 3.3: The Future of AI-Driven Innovation\n",
+      "Query: Llama 3.3 vs Llama 3.1 comparison\n",
+      "Searching for: Llama 3.3 vs Llama 3.1 comparison\n",
+      "  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n",
+      "  Result 2: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n",
+      "  Result 3: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n",
+      "  Result 4: Llama 3.1 vs Llama 3 Differences - GoPenAI...\n",
+      "  Result 5: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n",
+      "\n",
+      "Processing query 12/15\n",
+      "Report: Llama 3.3: The Future of AI-Driven Innovation\n",
+      "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "  Result 1: What's the cost of running Llama3:8b & 70b in the ...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n",
+      "  Result 4: Llama 3.3 API Pricing: What You Need to Know...\n",
+      "  Result 5: Llama models | Generative AI...\n",
+      "\n",
+      "Processing query 13/15\n",
+      "Report: Llama 3.3: A Technical Deep Dive\n",
+      "Query: Llama 3.3 architecture and technical specifications\n",
+      "Searching for: Llama 3.3 architecture and technical specifications\n",
+      "  Result 1: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 2: Introducing Meta Llama 3: The most capable openly ...\n",
+      "Error fetching HTML from https://ai.meta.com/blog/meta-llama-3/: 400 Client Error: Bad Request for url: https://ai.meta.com/blog/meta-llama-3/\n",
+      "  Result 3: meta-llama/Llama-3.3-70B-Instruct...\n",
+      "  Result 4: llama-3.3-70b-instruct Model by Meta...\n",
+      "  Result 5: Llama-3.3-70B - Documentation & FAQ...\n",
+      "\n",
+      "Processing query 14/15\n",
+      "Report: Llama 3.3: A Technical Deep Dive\n",
+      "Query: Llama 3.3 vs Llama 3.1 comparison\n",
+      "Searching for: Llama 3.3 vs Llama 3.1 comparison\n",
+      "  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...\n",
+      "  Result 2: Llama 3 vs Llama 3.1 : Which is Better for Your AI...\n",
+      "  Result 3: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...\n",
+      "  Result 4: Llama 3.1 vs Llama 3 Differences - GoPenAI...\n",
+      "  Result 5: Decoding Llama 3 vs 3.1: Which One Is Right for Yo...\n",
+      "\n",
+      "Processing query 15/15\n",
+      "Report: Llama 3.3: A Technical Deep Dive\n",
+      "Query: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "Searching for: Cost of running Llama 3.3 on cloud vs local infrastructure\n",
+      "  Result 1: What's the cost of running Llama3:8b & 70b in the ...\n",
+      "  Result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...\n",
+      "Error fetching HTML from https://www.datacamp.com/blog/llama-3-3-70b: 403 Client Error: Forbidden for url: https://www.datacamp.com/blog/llama-3-3-70b\n",
+      "  Result 3: Llama 3.3 vs. ChatGPT Pro: Key Considerations...\n",
+      "  Result 4: Llama 3.3 API Pricing: What You Need to Know...\n",
+      "  Result 5: Llama models | Generative AI...\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = process_all_queries(queries_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def analyze_results():\n",
+    "    \"\"\"Summarise base_dir / \"results_so_far.json\".\n",
+    "\n",
+    "    Prints the number of queries, the number of search hits, and how many of\n",
+    "    those hits have a saved page (a hit's \"filepath\" is None when its download\n",
+    "    failed).\n",
+    "\n",
+    "    Returns:\n",
+    "        A DataFrame with one row per query and the columns Report, Query,\n",
+    "        Results Count and Downloaded Count, or None when the results file\n",
+    "        does not exist.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        with open(base_dir / \"results_so_far.json\", \"r\", encoding=\"utf-8\") as f:\n",
+    "            results = json.load(f)\n",
+    "    except FileNotFoundError:\n",
+    "        print(\"No results file found. Run the processing first.\")\n",
+    "        return None\n",
+    "\n",
+    "    summary_data = [\n",
+    "        {\n",
+    "            \"Report\": query[\"report_title\"],\n",
+    "            \"Query\": query[\"query\"],\n",
+    "            \"Results Count\": len(query[\"results\"]),\n",
+    "            \"Downloaded Count\": sum(1 for hit in query[\"results\"] if hit.get(\"filepath\"))\n",
+    "        }\n",
+    "        for query in results\n",
+    "    ]\n",
+    "\n",
+    "    total_results = sum(entry[\"Results Count\"] for entry in summary_data)\n",
+    "    total_downloaded = sum(entry[\"Downloaded Count\"] for entry in summary_data)\n",
+    "    print(f\"Total queries processed: {len(results)}\")\n",
+    "    print(f\"Total search results fetched: {total_results}\")\n",
+    "    print(f\"Total pages downloaded: {total_downloaded}\")\n",
+    "\n",
+    "    return pd.DataFrame(summary_data)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total queries processed: 15\n",
+      "Total search results fetched: 70\n"
+     ]
+    }
+   ],
+   "source": [
+    "summary_df = analyze_results()\n",
+    "# Last expression renders the table; nothing is shown when the result is None.\n",
+    "summary_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}