Skip to content

Commit cc113b9

Browse files
authored
Merge pull request #1338 from patched-codes/feature/BrowserUse
Introduce BrowserUse step
2 parents e33580f + b43531d commit cc113b9

File tree

10 files changed

+1164
-21
lines changed

10 files changed

+1164
-21
lines changed

.github/workflows/release.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ name: Build
33
on:
44
push:
55
tags:
6-
- 'v*.*.*'
7-
6+
- "v*.*.*"
87

98
jobs:
109
publish-to-pypi:
@@ -18,7 +17,7 @@ jobs:
1817
id: setup-python
1918
uses: actions/setup-python@v5
2019
with:
21-
python-version: '3.9'
20+
python-version: "3.11"
2221

2322
- name: Install Poetry
2423
uses: snok/install-poetry@v1

.github/workflows/test.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ jobs:
4040
id: setup-python
4141
uses: actions/setup-python@v5
4242
with:
43-
python-version: '3.9'
43+
python-version: "3.11"
4444

4545
- name: Install Poetry
4646
uses: snok/install-poetry@v1
@@ -78,7 +78,7 @@ jobs:
7878
id: setup-python
7979
uses: actions/setup-python@v5
8080
with:
81-
python-version: '3.9'
81+
python-version: "3.11"
8282

8383
- name: Install Poetry
8484
uses: snok/install-poetry@v1
@@ -133,7 +133,7 @@ jobs:
133133
id: setup-python
134134
uses: actions/setup-python@v5
135135
with:
136-
python-version: '3.9'
136+
python-version: "3.11"
137137

138138
- name: Install Poetry
139139
uses: snok/install-poetry@v1
@@ -171,7 +171,7 @@ jobs:
171171
--base_path=tests/cicd/generate_docstring \
172172
--disable_telemetry
173173
174-
- name : Generate Diagram
174+
- name: Generate Diagram
175175
run: |
176176
source .venv/bin/activate
177177
patchwork GenerateDiagram --log debug \
@@ -188,7 +188,7 @@ jobs:
188188
--github_api_key=${{ secrets.SCM_GITHUB_KEY }} \
189189
--folder_path=tests/cicd/generate_docstring \
190190
--disable_telemetry
191-
191+
192192
- name: Generate Code Usage Example
193193
run: |
194194
source .venv/bin/activate
@@ -204,15 +204,15 @@ jobs:
204204
# Specify the parent folder you want to check
205205
PARENT_FOLDER="./patchwork/steps"
206206
# Command to run if README.md is not found
207-
207+
208208
find "$PARENT_FOLDER" -mindepth 1 -maxdepth 1 -type d | grep -vE '/\.\.?/' | grep -vE '/__' | while read -r dir; do
209209
if [[ ! -f "$dir/README.md" ]]; then
210210
echo "No README.md in $dir"
211211
# Extract the last part of the path to use as a base for the branch name
212212
base_name=$(basename "$dir")
213213
# Convert to a Git-friendly branch name: replace spaces with underscores, remove slashes, etc.
214214
branch_name=$(echo "$base_name" | sed -e 's/[^a-zA-Z0-9]/_/g' -e 's/__*/_/g' -e 's/^_//g' -e 's/_$//g')
215-
215+
216216
patchwork GenerateREADME --log debug \
217217
--patched_api_key=${{ secrets.PATCHED_API_KEY }} \
218218
--github_api_key=${{ secrets.SCM_GITHUB_KEY }} \

patchwork/common/client/llm/google.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def is_model_supported(self, model: str) -> bool:
111111
return model in self.get_models()
112112

113113
def __upload(self, file: Path | NotGiven) -> Part | File | None:
114-
if file is NotGiven:
114+
if isinstance(file, NotGiven):
115115
return None
116116

117117
file_bytes = file.read_bytes()
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
import asyncio
2+
import logging
3+
import os
4+
from datetime import datetime
5+
6+
from patchwork.step import Step
7+
from patchwork.steps import SimplifiedLLMOnce
8+
from patchwork.steps.BrowserUse.typed import BrowserUseInputs, BrowserUseOutputs
9+
10+
logger = logging.getLogger(__name__)

# Module-level cache for the browser/controller pair; both start out as None
# and are filled in by init_browser() the first time it runs.
_browser = _controller = None
15+
16+
17+
def init_browser():
    """
    Initialize and cache browser and controller instances.

    The first call creates a ``downloads`` directory under the current working
    directory, builds a headless ``Browser`` and a ``Controller`` with two
    custom actions (``upload_file`` and ``read_file``), and stores both in the
    module-level ``_browser`` / ``_controller`` globals. Later calls return
    the cached pair unchanged.

    Returns:
        tuple: (Browser, Controller) instances for web automation
    """
    global _browser, _controller

    # Return cached instances if already initialized
    if _browser is not None and _controller is not None:
        return _browser, _controller

    # Imported inside the function, so browser_use is only loaded on first use.
    from browser_use import Browser, BrowserConfig, BrowserContextConfig, Controller
    from browser_use.agent.views import ActionResult
    from browser_use.browser.context import BrowserContext

    # Set up downloads directory for browser operations. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    downloads_path = os.path.join(os.getcwd(), "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    context_config = BrowserContextConfig(save_downloads_path=downloads_path)
    # NOTE(review): disable_security=True presumably relaxes browser security
    # features - confirm this is acceptable for the sites the agent visits.
    config = BrowserConfig(
        headless=True, disable_security=True, new_context_config=context_config
    )
    controller = Controller()

    def _fail(msg):
        """Log *msg* and return it wrapped in an error ActionResult."""
        logger.info(msg)
        return ActionResult(error=msg)

    # Register custom action to upload files to web elements
    @controller.action(
        description="Upload file to interactive element with file path",
    )
    async def upload_file(index: int, path: str, browser: BrowserContext):
        """
        Upload a file to a file input element identified by its index.

        Args:
            index: The DOM element index to target
            path: Local file path to upload
            browser: Browser context for interaction

        Returns:
            ActionResult: Result of the upload operation; carries ``error``
            when the file is missing, no file input can be located, or the
            upload itself raises.
        """
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        not_found_msg = (
            f"No file upload element found at index {index}. "
            "The element may be hidden or not an input type file"
        )

        dom_el = await browser.get_dom_element_by_index(index)
        file_upload_dom_el = dom_el.get_file_upload_element()
        if file_upload_dom_el is None:
            return _fail(not_found_msg)

        file_upload_el = await browser.get_locate_element(file_upload_dom_el)
        if file_upload_el is None:
            return _fail(not_found_msg)

        try:
            await file_upload_el.set_input_files(path)
        except Exception as e:
            # Action boundary: report the failure to the agent instead of raising.
            return _fail(f"Failed to upload file to index {index}: {str(e)}")

        msg = f"Successfully uploaded file to index {index}"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    # Register custom action to read file contents
    @controller.action(description="Read the file content of a file given a path")
    async def read_file(path: str):
        """
        Read and return the contents of a file at the specified path.

        Args:
            path: Path to the file to read

        Returns:
            ActionResult: File contents, or an error message when the path
            does not exist or cannot be read as text.
        """
        # NOTE(review): any path supplied by the agent is read and its content
        # is logged at INFO level; there is no path restriction here - confirm
        # that is intended.
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        try:
            with open(path, "r") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # e.g. the path is a directory, unreadable, or not decodable text;
            # mirror upload_file and hand the error back to the agent.
            return _fail(f"Failed to read file {path}: {e}")

        msg = f"File content: {content}"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    # Cache the initialized instances
    _browser = Browser(config=config)
    _controller = controller

    return _browser, _controller
118+
119+
120+
class BrowserUse(Step, input_class=BrowserUseInputs, output_class=BrowserUseOutputs):
    """
    Step implementation for browser automation tasks.

    This class provides a high-level interface for executing browser-based tasks
    using various LLM providers (Google, OpenAI, Anthropic) to control the browser.
    The provider is chosen by which API key is present in the inputs, checked in
    the order ``google_api_key``, ``openai_api_key``, ``anthropic_api_key``.
    """

    required_keys = {"task"}

    def __init__(self, inputs):
        """
        Initialize the BrowserUse step with configuration inputs.

        Args:
            inputs: Dictionary containing configuration parameters (see: BrowserUseInputs)

        Raises:
            ValueError: If a required key is missing, or if none of
                ``google_api_key``, ``openai_api_key`` or ``anthropic_api_key``
                is provided (no LLM could be configured).
        """
        super().__init__(inputs)

        if not all(key in inputs.keys() for key in self.required_keys):
            raise ValueError(f'Missing required data: "{self.required_keys}"')

        # Configure the appropriate LLM based on provided API keys
        if "google_api_key" in self.inputs:
            from langchain_google_genai import ChatGoogleGenerativeAI

            self.llm = ChatGoogleGenerativeAI(
                model="gemini-2.0-flash", google_api_key=self.inputs["google_api_key"]
            )
        elif "openai_api_key" in self.inputs:
            from langchain_openai import ChatOpenAI

            self.llm = ChatOpenAI(model="gpt-4o", api_key=self.inputs["openai_api_key"])
        elif "anthropic_api_key" in self.inputs:
            from langchain_anthropic import ChatAnthropic

            self.llm = ChatAnthropic(
                model="claude-3-7-sonnet-latest",
                api_key=self.inputs["anthropic_api_key"],
            )
        else:
            # Without this, self.llm would be unset and run() would fail later
            # with an AttributeError that hides the real cause.
            raise ValueError(
                "Missing LLM credentials: provide one of "
                '"google_api_key", "openai_api_key" or "anthropic_api_key"'
            )

        # Configure GIF generation for debugging/visualization: a timestamped
        # file name when either flag is truthy, otherwise False.
        wants_gif = any(
            key in self.inputs and self.inputs[key]
            for key in ("generate_gif", "debug")
        )
        self.generate_gif = (
            f"agent_history_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.gif"
            if wants_gif
            else False
        )

    def run(self) -> dict:
        """
        Execute the browser automation task.

        This method initializes the browser agent, runs the specified task,
        and returns the results, optionally formatting them as JSON.

        Returns:
            dict: Results of the browser automation task. When ``example_json``
            is present in the inputs, this is whatever ``SimplifiedLLMOnce``
            returns; otherwise it holds ``history``, ``result`` and
            ``generated_gif``.
        """
        from browser_use import Agent

        browser, controller = init_browser()
        agent = Agent(
            browser=browser,
            controller=controller,
            task=self.inputs["task"],
            llm=self.llm,
            generate_gif=self.generate_gif,
            validate_output=True,
        )

        # Run the agent in an event loop
        # NOTE(review): a fresh loop is created per run and never closed, while
        # the browser from init_browser() is cached across runs. Closing the
        # loop here may invalidate resources the cached browser still holds -
        # confirm the intended lifecycle before changing this.
        loop = asyncio.new_event_loop()
        self.history = loop.run_until_complete(agent.run())

        # Format results as JSON if schema provided
        if "example_json" in self.inputs:
            return self.__format_history_as_json()

        return {
            "history": self.history,
            "result": self.history.final_result(),
            "generated_gif": self.generate_gif,
        }

    def __format_history_as_json(self):
        """
        Format browser history as JSON using an LLM.

        Uses the same LLM provider as the main task (with the model names
        hard-coded below) to convert the final result of the browser history
        into a structured JSON format based on the provided schema.

        Returns:
            dict: Formatted JSON result
        """
        inputs = dict(
            user_prompt=f"""
You are a helpful assistant that formats a history of browser actions and conversations into a JSON object.
You are provided with a JSON schema for the history.
Only include the JSON object in your response, nothing else.

Here is the history:
<history>
{self.history.final_result()}
</history>
""",
            json_schema=self.inputs["example_json"],
            prompt_value=dict(),
        )

        # Forward the provider key in the same priority order used by __init__.
        if "google_api_key" in self.inputs:
            inputs["google_api_key"] = self.inputs["google_api_key"]
            inputs["model"] = "gemini-2.0-flash"
        elif "openai_api_key" in self.inputs:
            inputs["openai_api_key"] = self.inputs["openai_api_key"]
            inputs["model"] = "gpt-4o-mini"
        elif "anthropic_api_key" in self.inputs:
            inputs["anthropic_api_key"] = self.inputs["anthropic_api_key"]
            inputs["model"] = "claude-3-5-haiku-latest"
        return SimplifiedLLMOnce(inputs).run()

0 commit comments

Comments
 (0)