11import asyncio
22import logging
33import os
4-
5- from browser_use import Agent , Browser , BrowserConfig , BrowserContextConfig , Controller
6- from browser_use .agent .views import ActionResult
7- from browser_use .browser .context import BrowserContext
8- from langchain_google_genai import ChatGoogleGenerativeAI
9- from langchain_openai import ChatOpenAI
10- from langchain_anthropic import ChatAnthropic
114from datetime import datetime
125
136from patchwork .step import Step
147from patchwork .steps import SimplifiedLLMOnce
158from patchwork .steps .BrowserUse .typed import BrowserUseInputs , BrowserUseOutputs
169
17- downloads_path = os .path .join (os .getcwd (), "downloads" )
1810logger = logging .getLogger (__name__ )
19- context_config = BrowserContextConfig (save_downloads_path = downloads_path )
20- config = BrowserConfig (
21- headless = True , disable_security = True , new_context_config = context_config
22- )
23- controller = Controller ()
24-
25- if not os .path .exists (downloads_path ):
26- os .makedirs (downloads_path )
27-
28-
29- @controller .action (
30- description = "Upload file to interactive element with file path" ,
31- )
32- async def upload_file (index : int , path : str , browser : BrowserContext ):
33- if not os .path .exists (path ):
34- return ActionResult (error = f"File { path } does not exist" )
35-
36- dom_el = await browser .get_dom_element_by_index (index )
37- file_upload_dom_el = dom_el .get_file_upload_element ()
38-
39- if file_upload_dom_el is None :
40- msg = f"No file upload element found at index { index } . The element may be hidden or not an input type file"
41- logger .info (msg )
42- return ActionResult (error = msg )
4311
44- file_upload_el = await browser .get_locate_element (file_upload_dom_el )
45-
46- if file_upload_el is None :
47- msg = f"No file upload element found at index { index } . The element may be hidden or not an input type file"
48- logger .info (msg )
49- return ActionResult (error = msg )
50-
51- try :
52- await file_upload_el .set_input_files (path )
53- msg = f"Successfully uploaded file to index { index } "
12+ # Global variables to cache browser initialization
13+ _browser = None
14+ _controller = None
15+
16+
def init_browser():
    """
    Initialize and cache the shared browser and controller instances.

    The first call creates the downloads directory, builds a headless
    ``Browser`` and a ``Controller`` with two custom actions registered
    (``upload_file`` and ``read_file``), and stores both in the module-level
    ``_browser`` / ``_controller`` globals. Later calls return the cached
    pair without rebuilding anything.

    The ``browser_use`` imports are deferred to the first call of this
    function instead of running at module import time.

    Returns:
        tuple: (Browser, Controller) instances for web automation
    """
    global _browser, _controller

    # Return cached instances if already initialized
    if _browser is not None and _controller is not None:
        return _browser, _controller

    from browser_use import Browser, BrowserConfig, BrowserContextConfig, Controller
    from browser_use.agent.views import ActionResult
    from browser_use.browser.context import BrowserContext

    # Set up downloads directory for browser operations.
    # exist_ok avoids the check-then-create race when another process (or an
    # earlier run) creates the directory between the check and the mkdir.
    downloads_path = os.path.join(os.getcwd(), "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    context_config = BrowserContextConfig(save_downloads_path=downloads_path)
    config = BrowserConfig(
        headless=True, disable_security=True, new_context_config=context_config
    )
    controller = Controller()

    def _failure(msg):
        # Log the message and wrap it as an error result, so every failing
        # action reports problems the same way.
        logger.info(msg)
        return ActionResult(error=msg)

    # Register custom action to upload files to web elements
    @controller.action(
        description="Upload file to interactive element with file path",
    )
    async def upload_file(index: int, path: str, browser: BrowserContext):
        """
        Upload a file to a file input element identified by its index.

        Args:
            index: The DOM element index to target
            path: Local file path to upload
            browser: Browser context for interaction

        Returns:
            ActionResult: Result of the upload operation
        """
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        not_found_msg = (
            f"No file upload element found at index {index}. "
            "The element may be hidden or not an input type file"
        )

        dom_el = await browser.get_dom_element_by_index(index)
        file_upload_dom_el = dom_el.get_file_upload_element()
        if file_upload_dom_el is None:
            return _failure(not_found_msg)

        file_upload_el = await browser.get_locate_element(file_upload_dom_el)
        if file_upload_el is None:
            return _failure(not_found_msg)

        try:
            await file_upload_el.set_input_files(path)
        except Exception as e:
            # Broad catch is deliberate: any failure is reported back to the
            # agent as an error result rather than raised out of the action.
            return _failure(f"Failed to upload file to index {index}: {e}")

        msg = f"Successfully uploaded file to index {index}"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    # Register custom action to read file contents
    @controller.action(description="Read the file content of a file given a path")
    async def read_file(path: str):
        """
        Read and return the contents of a file at the specified path.

        Args:
            path: Path to the file to read

        Returns:
            ActionResult: File contents or error message
        """
        if not os.path.exists(path):
            return ActionResult(error=f"File {path} does not exist")

        try:
            with open(path, "r") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # The path can exist and still be unreadable as text (directory,
            # permission denied, binary data); report it like upload_file
            # does instead of letting the exception escape the action.
            return _failure(f"Failed to read file {path}: {e}")

        msg = f"File content: {content}"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    # Cache the initialized instances
    _browser = Browser(config=config)
    _controller = controller

    return _browser, _controller
72118
73119
74120class BrowserUse (Step , input_class = BrowserUseInputs , output_class = BrowserUseOutputs ):
121+ """
122+ Step implementation for browser automation tasks.
123+
124+ This class provides a high-level interface for executing browser-based tasks
125+ using various LLM providers (Google, OpenAI, Anthropic) to control the browser.
126+ """
75127 required_keys = {"task" }
76128
77129 def __init__ (self , inputs ):
130+ """
131+ Initialize the BrowserUse step with configuration inputs.
132+
133+ Args:
134+ inputs: Dictionary containing configuration parameters (see: BrowserUseInputs)
135+ """
78136 super ().__init__ (inputs )
79137
80138 if not all (key in inputs .keys () for key in self .required_keys ):
81139 raise ValueError (f'Missing required data: "{ self .required_keys } "' )
82140
83- self .browser = Browser (config = config )
84-
141+ # Configure the appropriate LLM based on provided API keys
85142 if "google_api_key" in self .inputs :
143+ from langchain_google_genai import ChatGoogleGenerativeAI
144+
86145 self .llm = ChatGoogleGenerativeAI (
87146 model = "gemini-2.0-flash" , google_api_key = self .inputs ["google_api_key" ]
88147 )
89148 elif "openai_api_key" in self .inputs :
149+ from langchain_openai import ChatOpenAI
150+
90151 self .llm = ChatOpenAI (model = "gpt-4o" , api_key = self .inputs ["openai_api_key" ])
91152 elif "anthropic_api_key" in self .inputs :
153+ from langchain_anthropic import ChatAnthropic
154+
92155 self .llm = ChatAnthropic (
93156 model = "claude-3-7-sonnet-latest" ,
94157 api_key = self .inputs ["anthropic_api_key" ],
95158 )
159+
160+ # Configure GIF generation for debugging/visualization
96161 self .generate_gif = (
97162 f"agent_history_{ datetime .now ().strftime ('%Y-%m-%d_%H-%M-%S' )} .gif"
98163 if ("generate_gif" in self .inputs and self .inputs ["generate_gif" ])
@@ -101,17 +166,32 @@ def __init__(self, inputs):
101166 )
102167
103168 def run (self ) -> dict :
169+ """
170+ Execute the browser automation task.
171+
172+ This method initializes the browser agent, runs the specified task,
173+ and returns the results, optionally formatting them as JSON.
174+
175+ Returns:
176+ dict: Results of the browser automation task
177+ """
178+ from browser_use import Agent
179+
180+ browser , controller = init_browser ()
104181 agent = Agent (
105- browser = self .browser ,
182+ browser = browser ,
183+ controller = controller ,
106184 task = self .inputs ["task" ],
107185 llm = self .llm ,
108186 generate_gif = self .generate_gif ,
109187 validate_output = True ,
110- controller = controller ,
111188 )
112189
190+ # Run the agent in an event loop
113191 loop = asyncio .new_event_loop ()
114192 self .history = loop .run_until_complete (agent .run ())
193+
194+ # Format results as JSON if schema provided
115195 if "example_json" in self .inputs :
116196 return self .__format_history_as_json ()
117197
@@ -122,6 +202,16 @@ def run(self) -> dict:
122202 }
123203
124204 def __format_history_as_json (self ):
205+ """
206+ Format browser history as JSON using an LLM.
207+
208+ Uses the same LLM provider as the main task to convert
209+ the browser history into a structured JSON format based
210+ on the provided schema.
211+
212+ Returns:
213+ dict: Formatted JSON result
214+ """
125215 inputs = dict (
126216 user_prompt = f"""
127217You are a helpful assistant that formats a history of browser actions and conversations into a JSON object.
0 commit comments