Quellcode durchsuchen

updated recipe to use llama 4 (#922)

varunfb vor 1 Monat
Ursprung
Commit
62c49aa923
1 geänderte Dateien mit 26 neuen und 23 gelöschten Zeilen
  1. 26 23
      end-to-end-use-cases/browser_use/agent/browser-use-quickstart.ipynb

+ 26 - 23
end-to-end-use-cases/browser_use/agent/browser-use-quickstart.ipynb

@@ -4,9 +4,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 🌐 Building an Intelligent Browser Agent with Llama 3.2\n",
+    "# 🌐 Building an Intelligent Browser Agent with Llama 4 Scout\n",
     "\n",
-    "This notebook provides a step-by-step guide to creating an AI-powered browser agent capable of navigating and interacting with websites autonomously. By combining the power of Llama 3.2 Vision, Playwright, and Together AI, this agent can perform tasks seamlessly while understanding both visual and textual content.\n",
+    "This notebook provides a step-by-step guide to creating an AI-powered browser agent capable of navigating and interacting with websites autonomously. By combining the power of Llama 4 Scout, Playwright, and Together AI, this agent can perform tasks seamlessly while understanding both visual and textual content.\n",
     "\n",
     "##### Demo\n",
     "For a detailed explanation of the code and a demo video, visit our blog post: [**Blog Post and Demo Video**](https://miguelg719.github.io/browser-use-blog/)\n",
@@ -29,7 +29,7 @@
     "- Setting up the environment and installing dependencies.\n",
     "- Automating browser interactions using Playwright.\n",
     "- Defining a structured prompt for the LLM to understand the task and execute the next action.\n",
-    "- Leveraging Llama 3.2 Vision for content comprehension.\n",
+    "- Leveraging Llama 4 Scout for content comprehension.\n",
     "- Creating a persistent and intelligent browser agent for real-world applications.\n",
     "\n",
     "***Please note that the agent is not perfect and may not always behave as expected.**\n",
@@ -65,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,12 +85,12 @@
     "##### Vision Query Example\n",
     "This function converts an image file into a Base64-encoded string, which is required for LLM querying.\n",
     "\n",
-    "The next cell shows an example of how to use the `encode_image` function to convert an image file into a Base64-encoded string, which is then used in a chat completion request to the Llama 3.2 Vision model.\n"
+    "The next cell shows an example of how to use the `encode_image` function to convert an image file into a Base64-encoded string, which is then used in a chat completion request to the Llama 4 Scout model.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -113,7 +113,7 @@
    "outputs": [],
    "source": [
     "response = client.chat.completions.create(\n",
-    "    model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
+    "    model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
     "    messages=[\n",
     "        {\n",
     "            \"role\": \"user\",\n",
@@ -145,7 +145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -197,7 +197,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -327,7 +327,6 @@
     "       \"reasoning\": \"The accessibility tree shows an input field with placeholder 'Search...'. Entering the query 'AI tools' fulfills the next step of the task.\",\n",
     "       \"action\": \"fill\",\n",
     "       \"selector\": \"placeholder=Search...\",\n",
-    "       \"value\": \"AI tools\"\n",
     "   }\n",
     "\n",
     "3. To navigate to a specific URL:\n",
@@ -358,7 +357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -471,7 +470,6 @@
     "    \"summary\": \"Email sent to Dmitry to reschedule the meeting for tomorrow morning.\"\n",
     "}\n",
     "\"\"\"\n",
-    "\n",
     "few_shot_examples = [few_shot_example_1, few_shot_example_2]"
    ]
   },
@@ -486,13 +484,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define your task here:\n",
-    "# task = 'Find toys to buy for my 10 year old niece this Christmas'\n",
-    "# task = 'Find tickets for the next Warriors game'\n",
+    "#task = 'Find toys to buy for my 10 year old niece this Christmas'\n",
+    "#task = 'Find tickets for the next Warriors game'\n",
     "task = 'Find the cheapest flight to Madrid'"
    ]
   },
@@ -513,7 +511,7 @@
    "source": [
     "print(\"Generating plan...\")\n",
     "planning_response = client.chat.completions.create(\n",
-    "    model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
+    "    model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
     "    temperature=0.0,\n",
     "    messages=[\n",
     "        {\"role\": \"system\", \"content\": planning_prompt},\n",
@@ -567,7 +565,7 @@
     "                base64_image = encode_image(imagePath)\n",
     "                previous_context = accessibility_tree\n",
     "                response = client.chat.completions.create(\n",
-    "                    model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
+    "                    model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
     "                    temperature=0.0,\n",
     "                    messages=[\n",
     "                        {\"role\": \"system\", \"content\": execution_prompt},\n",
@@ -590,6 +588,11 @@
     "                    ],\n",
     "                )\n",
     "                res = response.choices[0].message.content\n",
+    "                # Sanitize the raw model output before extracting the JSON object:\n",
+    "                # Remove any trailing commas\n",
+    "                res = res.rstrip(',')\n",
+    "                # Drop ASCII control characters (code points below 32), keeping only line feed and carriage return\n",
+    "                res = ''.join(c for c in res if ord(c) >= 32 or ord(c) == 10 or ord(c) == 13)\n",
     "                print('Agent response:', res)\n",
     "                try:\n",
     "                    match = re.search(r'\\{.*\\}', res, re.DOTALL)\n",
@@ -604,7 +607,7 @@
     "                        previous_actions.append(f\"navigated to {output['url']}, SUCCESS\")\n",
     "                    except Exception as e:\n",
     "                        previous_actions.append(f\"Error navigating to {output['url']}: {e}\")\n",
-    "\n",
+    "    \n",
     "                elif output[\"action\"] == \"click\":\n",
     "                    try:\n",
     "                        selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
@@ -612,7 +615,7 @@
     "                        previous_actions.append(f\"clicked {output['selector']}, SUCCESS\")\n",
     "                    except Exception as e:\n",
     "                        previous_actions.append(f\"Error clicking on {output['selector']}: {e}\")\n",
-    "                        \n",
+    "\n",
     "                elif output[\"action\"] == \"fill\":\n",
     "                    try:\n",
     "                        selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
@@ -666,9 +669,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "browser-use",
    "language": "python",
-   "name": "python3"
+   "name": "browser-use"
   },
   "language_info": {
    "codemirror_mode": {
@@ -680,9 +683,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }