Skip to content

Commit b43531d

Browse files
committed
Fix browser use imports etc
1 parent dc9b21e commit b43531d

File tree

1 file changed

+150
-60
lines changed

1 file changed

+150
-60
lines changed

patchwork/steps/BrowserUse/BrowserUse.py

Lines changed: 150 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,98 +1,163 @@
11
import asyncio
22
import logging
33
import os
4-
5-
from browser_use import Agent, Browser, BrowserConfig, BrowserContextConfig, Controller
6-
from browser_use.agent.views import ActionResult
7-
from browser_use.browser.context import BrowserContext
8-
from langchain_google_genai import ChatGoogleGenerativeAI
9-
from langchain_openai import ChatOpenAI
10-
from langchain_anthropic import ChatAnthropic
114
from datetime import datetime
125

136
from patchwork.step import Step
147
from patchwork.steps import SimplifiedLLMOnce
158
from patchwork.steps.BrowserUse.typed import BrowserUseInputs, BrowserUseOutputs
169

17-
downloads_path = os.path.join(os.getcwd(), "downloads")
1810
logger = logging.getLogger(__name__)
19-
context_config = BrowserContextConfig(save_downloads_path=downloads_path)
20-
config = BrowserConfig(
21-
headless=True, disable_security=True, new_context_config=context_config
22-
)
23-
controller = Controller()
24-
25-
if not os.path.exists(downloads_path):
26-
os.makedirs(downloads_path)
27-
28-
29-
@controller.action(
30-
description="Upload file to interactive element with file path",
31-
)
32-
async def upload_file(index: int, path: str, browser: BrowserContext):
33-
if not os.path.exists(path):
34-
return ActionResult(error=f"File {path} does not exist")
35-
36-
dom_el = await browser.get_dom_element_by_index(index)
37-
file_upload_dom_el = dom_el.get_file_upload_element()
38-
39-
if file_upload_dom_el is None:
40-
msg = f"No file upload element found at index {index}. The element may be hidden or not an input type file"
41-
logger.info(msg)
42-
return ActionResult(error=msg)
4311

44-
file_upload_el = await browser.get_locate_element(file_upload_dom_el)
45-
46-
if file_upload_el is None:
47-
msg = f"No file upload element found at index {index}. The element may be hidden or not an input type file"
48-
logger.info(msg)
49-
return ActionResult(error=msg)
50-
51-
try:
52-
await file_upload_el.set_input_files(path)
53-
msg = f"Successfully uploaded file to index {index}"
12+
# Global variables to cache browser initialization
13+
_browser = None
14+
_controller = None
15+
16+
17+
def init_browser():
    """
    Initialize and cache the browser and controller instances.

    On the first call this imports ``browser_use``, makes sure the downloads
    directory (``<cwd>/downloads``) exists, builds a ``Controller`` with the
    custom ``upload_file`` and ``read_file`` actions registered on it, creates
    a headless ``Browser``, and stores both in the module-level ``_browser``
    and ``_controller`` globals. Later calls return the cached pair.

    Returns:
        tuple: (Browser, Controller) instances for web automation
    """
    global _browser, _controller

    # Return cached instances if already initialized
    if _browser is not None and _controller is not None:
        return _browser, _controller

    # Imported lazily: browser_use is loaded on the first call, not when this
    # module is imported.
    from browser_use import Browser, BrowserConfig, BrowserContextConfig, Controller
    from browser_use.agent.views import ActionResult
    from browser_use.browser.context import BrowserContext

    # Set up downloads directory for browser operations. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    downloads_path = os.path.join(os.getcwd(), "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    context_config = BrowserContextConfig(save_downloads_path=downloads_path)
    config = BrowserConfig(
        headless=True, disable_security=True, new_context_config=context_config
    )
    controller = Controller()

    # Register custom action to upload files to web elements
    @controller.action(
        description="Upload file to interactive element with file path",
    )
    async def upload_file(index: int, path: str, browser: BrowserContext):
        """
        Upload a file to a file input element identified by its index.

        Args:
            index: The DOM element index to target
            path: Local file path to upload
            browser: Browser context for interaction

        Returns:
            ActionResult: Success message, or an error if the file is missing,
                no file-upload element can be located, or the upload fails
        """
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        # Same message is reported whether the DOM lookup or the element
        # location step comes back empty.
        not_found_msg = f"No file upload element found at index {index}. The element may be hidden or not an input type file"

        dom_el = await browser.get_dom_element_by_index(index)
        file_upload_dom_el = dom_el.get_file_upload_element()

        if file_upload_dom_el is None:
            logger.info(not_found_msg)
            return ActionResult(error=not_found_msg)

        file_upload_el = await browser.get_locate_element(file_upload_dom_el)

        if file_upload_el is None:
            logger.info(not_found_msg)
            return ActionResult(error=not_found_msg)

        try:
            await file_upload_el.set_input_files(path)
            msg = f"Successfully uploaded file to index {index}"
            logger.info(msg)
            return ActionResult(extracted_content=msg, include_in_memory=True)
        except Exception as e:
            msg = f"Failed to upload file to index {index}: {str(e)}"
            logger.info(msg)
            return ActionResult(error=msg)

    # Register custom action to read file contents
    @controller.action(description="Read the file content of a file given a path")
    async def read_file(path: str):
        """
        Read and return the contents of a file at the specified path.

        Args:
            path: Path to the file to read

        Returns:
            ActionResult: File contents, or an error if the file is missing
                or cannot be read as text
        """
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        # Report read failures (permissions, directories, undecodable bytes)
        # as an action error, matching upload_file, instead of raising.
        try:
            with open(path, "r") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            msg = f"Failed to read file {path}: {str(e)}"
            logger.info(msg)
            return ActionResult(error=msg)

        msg = f"File content: {content}"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    # Cache the initialized instances
    _browser = Browser(config=config)
    _controller = controller

    return _browser, _controller
72118

73119

74120
class BrowserUse(Step, input_class=BrowserUseInputs, output_class=BrowserUseOutputs):
121+
"""
122+
Step implementation for browser automation tasks.
123+
124+
This class provides a high-level interface for executing browser-based tasks
125+
using various LLM providers (Google, OpenAI, Anthropic) to control the browser.
126+
"""
75127
required_keys = {"task"}
76128

77129
def __init__(self, inputs):
130+
"""
131+
Initialize the BrowserUse step with configuration inputs.
132+
133+
Args:
134+
inputs: Dictionary containing configuration parameters (see: BrowserUseInputs)
135+
"""
78136
super().__init__(inputs)
79137

80138
if not all(key in inputs.keys() for key in self.required_keys):
81139
raise ValueError(f'Missing required data: "{self.required_keys}"')
82140

83-
self.browser = Browser(config=config)
84-
141+
# Configure the appropriate LLM based on provided API keys
85142
if "google_api_key" in self.inputs:
143+
from langchain_google_genai import ChatGoogleGenerativeAI
144+
86145
self.llm = ChatGoogleGenerativeAI(
87146
model="gemini-2.0-flash", google_api_key=self.inputs["google_api_key"]
88147
)
89148
elif "openai_api_key" in self.inputs:
149+
from langchain_openai import ChatOpenAI
150+
90151
self.llm = ChatOpenAI(model="gpt-4o", api_key=self.inputs["openai_api_key"])
91152
elif "anthropic_api_key" in self.inputs:
153+
from langchain_anthropic import ChatAnthropic
154+
92155
self.llm = ChatAnthropic(
93156
model="claude-3-7-sonnet-latest",
94157
api_key=self.inputs["anthropic_api_key"],
95158
)
159+
160+
# Configure GIF generation for debugging/visualization
96161
self.generate_gif = (
97162
f"agent_history_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.gif"
98163
if ("generate_gif" in self.inputs and self.inputs["generate_gif"])
@@ -101,17 +166,32 @@ def __init__(self, inputs):
101166
)
102167

103168
def run(self) -> dict:
169+
"""
170+
Execute the browser automation task.
171+
172+
This method initializes the browser agent, runs the specified task,
173+
and returns the results, optionally formatting them as JSON.
174+
175+
Returns:
176+
dict: Results of the browser automation task
177+
"""
178+
from browser_use import Agent
179+
180+
browser, controller = init_browser()
104181
agent = Agent(
105-
browser=self.browser,
182+
browser=browser,
183+
controller=controller,
106184
task=self.inputs["task"],
107185
llm=self.llm,
108186
generate_gif=self.generate_gif,
109187
validate_output=True,
110-
controller=controller,
111188
)
112189

190+
# Run the agent in an event loop
113191
loop = asyncio.new_event_loop()
114192
self.history = loop.run_until_complete(agent.run())
193+
194+
# Format results as JSON if schema provided
115195
if "example_json" in self.inputs:
116196
return self.__format_history_as_json()
117197

@@ -122,6 +202,16 @@ def run(self) -> dict:
122202
}
123203

124204
def __format_history_as_json(self):
205+
"""
206+
Format browser history as JSON using an LLM.
207+
208+
Uses the same LLM provider as the main task to convert
209+
the browser history into a structured JSON format based
210+
on the provided schema.
211+
212+
Returns:
213+
dict: Formatted JSON result
214+
"""
125215
inputs = dict(
126216
user_prompt=f"""
127217
You are a helpful assistant that formats a history of browser actions and conversations into a JSON object.

0 commit comments

Comments
 (0)