|
@@ -4,9 +4,9 @@
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
- "# 🌐 Building an Intelligent Browser Agent with Llama 3.2\n",
|
|
|
|
|
|
+ "# 🌐 Building an Intelligent Browser Agent with Llama 4 Scout\n",
|
|
"\n",
|
|
"\n",
|
|
- "This notebook provides a step-by-step guide to creating an AI-powered browser agent capable of navigating and interacting with websites autonomously. By combining the power of Llama 3.2 Vision, Playwright, and Together AI, this agent can perform tasks seamlessly while understanding both visual and textual content.\n",
|
|
|
|
|
|
+ "This notebook provides a step-by-step guide to creating an AI-powered browser agent capable of navigating and interacting with websites autonomously. By combining the power of Llama 4 Scout, Playwright, and Together AI, this agent can perform tasks seamlessly while understanding both visual and textual content.\n",
|
|
"\n",
|
|
"\n",
|
|
"##### Demo\n",
|
|
"##### Demo\n",
|
|
"For a detailed explanation of the code and a demo video, visit our blog post: [**Blog Post and Demo Video**](https://miguelg719.github.io/browser-use-blog/)\n",
|
|
"For a detailed explanation of the code and a demo video, visit our blog post: [**Blog Post and Demo Video**](https://miguelg719.github.io/browser-use-blog/)\n",
|
|
@@ -29,7 +29,7 @@
|
|
"- Setting up the environment and installing dependencies.\n",
|
|
"- Setting up the environment and installing dependencies.\n",
|
|
"- Automating browser interactions using Playwright.\n",
|
|
"- Automating browser interactions using Playwright.\n",
|
|
"- Defining a structured prompt for the LLM to understand the task and execute the next action.\n",
|
|
"- Defining a structured prompt for the LLM to understand the task and execute the next action.\n",
|
|
- "- Leveraging Llama 3.2 Vision for content comprehension.\n",
|
|
|
|
|
|
+ "- Leveraging Llama 4 Scout for content comprehension.\n",
|
|
"- Creating a persistent and intelligent browser agent for real-world applications.\n",
|
|
"- Creating a persistent and intelligent browser agent for real-world applications.\n",
|
|
"\n",
|
|
"\n",
|
|
"***Please note that the agent is not perfect and may not always behave as expected.**\n",
|
|
"***Please note that the agent is not perfect and may not always behave as expected.**\n",
|
|
@@ -65,7 +65,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 2,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -85,12 +85,12 @@
|
|
"##### Vision Query Example\n",
|
|
"##### Vision Query Example\n",
|
|
"This function converts an image file into a Base64-encoded string, which is required for LLM querying.\n",
|
|
"This function converts an image file into a Base64-encoded string, which is required for LLM querying.\n",
|
|
"\n",
|
|
"\n",
|
|
- "The next cell shows an example of how to use the `encode_image` function to convert an image file into a Base64-encoded string, which is then used in a chat completion request to the Llama 3.2 Vision model.\n"
|
|
|
|
|
|
+ "The next cell shows an example of how to use the `encode_image` function to convert an image file into a Base64-encoded string, which is then used in a chat completion request to the Llama 4 Scout model.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 3,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -113,7 +113,7 @@
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
"response = client.chat.completions.create(\n",
|
|
"response = client.chat.completions.create(\n",
|
|
- " model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
|
|
|
|
|
|
+ " model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
|
|
" messages=[\n",
|
|
" messages=[\n",
|
|
" {\n",
|
|
" {\n",
|
|
" \"role\": \"user\",\n",
|
|
" \"role\": \"user\",\n",
|
|
@@ -145,7 +145,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 5,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -197,7 +197,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 6,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -327,7 +327,6 @@
|
|
" \"reasoning\": \"The accessibility tree shows an input field with placeholder 'Search...'. Entering the query 'AI tools' fulfills the next step of the task.\",\n",
|
|
" \"reasoning\": \"The accessibility tree shows an input field with placeholder 'Search...'. Entering the query 'AI tools' fulfills the next step of the task.\",\n",
|
|
" \"action\": \"fill\",\n",
|
|
" \"action\": \"fill\",\n",
|
|
" \"selector\": \"placeholder=Search...\",\n",
|
|
" \"selector\": \"placeholder=Search...\",\n",
|
|
- " \"value\": \"AI tools\"\n",
|
|
|
|
" }\n",
|
|
" }\n",
|
|
"\n",
|
|
"\n",
|
|
"3. To navigate to a specific URL:\n",
|
|
"3. To navigate to a specific URL:\n",
|
|
@@ -358,7 +357,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 7,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -471,7 +470,6 @@
|
|
" \"summary\": \"Email sent to Dmitry to reschedule the meeting for tomorrow morning.\"\n",
|
|
" \"summary\": \"Email sent to Dmitry to reschedule the meeting for tomorrow morning.\"\n",
|
|
"}\n",
|
|
"}\n",
|
|
"\"\"\"\n",
|
|
"\"\"\"\n",
|
|
- "\n",
|
|
|
|
"few_shot_examples = [few_shot_example_1, few_shot_example_2]"
|
|
"few_shot_examples = [few_shot_example_1, few_shot_example_2]"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -486,13 +484,13 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 8,
|
|
|
|
|
|
+ "execution_count": null,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
"# Define your task here:\n",
|
|
"# Define your task here:\n",
|
|
- "# task = 'Find toys to buy for my 10 year old niece this Christmas'\n",
|
|
|
|
- "# task = 'Find tickets for the next Warriors game'\n",
|
|
|
|
|
|
+ "#task = 'Find toys to buy for my 10 year old niece this Christmas'\n",
|
|
|
|
+ "#task = 'Find tickets for the next Warriors game'\n",
|
|
"task = 'Find the cheapest flight to Madrid'"
|
|
"task = 'Find the cheapest flight to Madrid'"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -513,7 +511,7 @@
|
|
"source": [
|
|
"source": [
|
|
"print(\"Generating plan...\")\n",
|
|
"print(\"Generating plan...\")\n",
|
|
"planning_response = client.chat.completions.create(\n",
|
|
"planning_response = client.chat.completions.create(\n",
|
|
- " model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
|
|
|
|
|
|
+ " model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
|
|
" temperature=0.0,\n",
|
|
" temperature=0.0,\n",
|
|
" messages=[\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": planning_prompt},\n",
|
|
" {\"role\": \"system\", \"content\": planning_prompt},\n",
|
|
@@ -567,7 +565,7 @@
|
|
" base64_image = encode_image(imagePath)\n",
|
|
" base64_image = encode_image(imagePath)\n",
|
|
" previous_context = accessibility_tree\n",
|
|
" previous_context = accessibility_tree\n",
|
|
" response = client.chat.completions.create(\n",
|
|
" response = client.chat.completions.create(\n",
|
|
- " model=\"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo\",\n",
|
|
|
|
|
|
+ " model=\"meta-llama/Llama-4-Scout-17B-16E-Instruct\",\n",
|
|
" temperature=0.0,\n",
|
|
" temperature=0.0,\n",
|
|
" messages=[\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": execution_prompt},\n",
|
|
" {\"role\": \"system\", \"content\": execution_prompt},\n",
|
|
@@ -590,6 +588,11 @@
|
|
" ],\n",
|
|
" ],\n",
|
|
" )\n",
|
|
" )\n",
|
|
" res = response.choices[0].message.content\n",
|
|
" res = response.choices[0].message.content\n",
|
|
|
|
+ " # Clean up the raw model output before extracting the JSON object:\n",
|
|
|
|
+ " # Remove any trailing commas\n",
|
|
|
|
+ " res = res.rstrip(',')\n",
|
|
|
|
+ " # Drop ASCII control characters (code points below 32), keeping LF and CR\n",
|
|
|
|
+ " res = ''.join(c for c in res if ord(c) >= 32 or ord(c) == 10 or ord(c) == 13)\n",
|
|
" print('Agent response:', res)\n",
|
|
" print('Agent response:', res)\n",
|
|
" try:\n",
|
|
" try:\n",
|
|
" match = re.search(r'\\{.*\\}', res, re.DOTALL)\n",
|
|
" match = re.search(r'\\{.*\\}', res, re.DOTALL)\n",
|
|
@@ -604,7 +607,7 @@
|
|
" previous_actions.append(f\"navigated to {output['url']}, SUCCESS\")\n",
|
|
" previous_actions.append(f\"navigated to {output['url']}, SUCCESS\")\n",
|
|
" except Exception as e:\n",
|
|
" except Exception as e:\n",
|
|
" previous_actions.append(f\"Error navigating to {output['url']}: {e}\")\n",
|
|
" previous_actions.append(f\"Error navigating to {output['url']}: {e}\")\n",
|
|
- "\n",
|
|
|
|
|
|
+ " \n",
|
|
" elif output[\"action\"] == \"click\":\n",
|
|
" elif output[\"action\"] == \"click\":\n",
|
|
" try:\n",
|
|
" try:\n",
|
|
" selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
|
|
" selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
|
|
@@ -612,7 +615,7 @@
|
|
" previous_actions.append(f\"clicked {output['selector']}, SUCCESS\")\n",
|
|
" previous_actions.append(f\"clicked {output['selector']}, SUCCESS\")\n",
|
|
" except Exception as e:\n",
|
|
" except Exception as e:\n",
|
|
" previous_actions.append(f\"Error clicking on {output['selector']}: {e}\")\n",
|
|
" previous_actions.append(f\"Error clicking on {output['selector']}: {e}\")\n",
|
|
- " \n",
|
|
|
|
|
|
+ "\n",
|
|
" elif output[\"action\"] == \"fill\":\n",
|
|
" elif output[\"action\"] == \"fill\":\n",
|
|
" try:\n",
|
|
" try:\n",
|
|
" selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
|
|
" selector_type, selector_name = output[\"selector\"].split(\"=\")[0], output[\"selector\"].split(\"=\")[1]\n",
|
|
@@ -666,9 +669,9 @@
|
|
],
|
|
],
|
|
"metadata": {
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"kernelspec": {
|
|
- "display_name": "base",
|
|
|
|
|
|
+ "display_name": "browser-use",
|
|
"language": "python",
|
|
"language": "python",
|
|
- "name": "python3"
|
|
|
|
|
|
+ "name": "browser-use"
|
|
},
|
|
},
|
|
"language_info": {
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"codemirror_mode": {
|
|
@@ -680,9 +683,9 @@
|
|
"name": "python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"pygments_lexer": "ipython3",
|
|
- "version": "3.11.3"
|
|
|
|
|
|
+ "version": "3.12.7"
|
|
}
|
|
}
|
|
},
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat": 4,
|
|
- "nbformat_minor": 2
|
|
|
|
|
|
+ "nbformat_minor": 4
|
|
}
|
|
}
|