@@ -442,13 +442,15 @@ async def handle_crawl_request(
     urls: List[str],
     browser_config: dict,
     crawler_config: dict,
-    config: dict
+    config: dict,
+    hooks_config: Optional[dict] = None
 ) -> dict:
-    """Handle non-streaming crawl requests."""
+    """Handle non-streaming crawl requests with optional hooks."""
     start_mem_mb = _get_memory_mb()  # <--- Get memory before
     start_time = time.time()
     mem_delta_mb = None
     peak_mem_mb = start_mem_mb
+    hook_manager = None

     try:
         urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
@@ -468,6 +470,19 @@ async def handle_crawl_request(
         # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
         # await crawler.start()

+        # Attach hooks if provided
+        hooks_status = {}
+        if hooks_config:
+            from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+            hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+            hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+                crawler,
+                hooks_config.get('code', {}),
+                timeout=hooks_config.get('timeout', 30),
+                hook_manager=hook_manager
+            )
+            logger.info(f"Hooks attachment status: {hooks_status['status']}")
+
         base_config = config["crawler"]["base_config"]
         # Iterate on key-value pairs in global_config then use hasattr to set them
         for key, value in base_config.items():
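
For context, the handler above only reads two keys from hooks_config, 'code' and 'timeout', so a request payload along the following lines should be accepted. The hook name and the idea that 'code' maps hook names to Python source strings are assumptions about hook_manager's contract, not something this diff confirms:

# Hypothetical hooks_config payload; only 'code' and 'timeout' are read by handle_crawl_request.
# The hook name "before_goto" and the source-string format are assumed from hook_manager's contract.
hooks_config = {
    "timeout": 30,  # per-hook timeout in seconds; the handler defaults to 30 when omitted
    "code": {
        "before_goto": (
            "async def before_goto(page, context, **kwargs):\n"
            "    # user hook body runs before navigation\n"
            "    return page\n"
        ),
    },
}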
@@ -484,6 +499,10 @@ async def handle_crawl_request(
             config=crawler_config,
             dispatcher=dispatcher)
         results = await partial_func()
+
+        # Ensure results is always a list
+        if not isinstance(results, list):
+            results = [results]

         # await crawler.close()

@@ -498,22 +517,72 @@ async def handle_crawl_request(
         # Process results to handle PDF bytes
         processed_results = []
         for result in results:
-            result_dict = result.model_dump()
-            # if fit_html is not a string, set it to None to avoid serialization errors
-            if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
-                result_dict["fit_html"] = None
-            # If PDF exists, encode it to base64
-            if result_dict.get('pdf') is not None:
-                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
-            processed_results.append(result_dict)
+            try:
+                # Check if result has model_dump method (is a proper CrawlResult)
+                if hasattr(result, 'model_dump'):
+                    result_dict = result.model_dump()
+                elif isinstance(result, dict):
+                    result_dict = result
+                else:
+                    # Handle unexpected result type
+                    logger.warning(f"Unexpected result type: {type(result)}")
+                    result_dict = {
+                        "url": str(result) if hasattr(result, '__str__') else "unknown",
+                        "success": False,
+                        "error_message": f"Unexpected result type: {type(result).__name__}"
+                    }
+
+                # if fit_html is not a string, set it to None to avoid serialization errors
+                if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+                    result_dict["fit_html"] = None
+
+                # If PDF exists, encode it to base64
+                if result_dict.get('pdf') is not None and isinstance(result_dict.get('pdf'), bytes):
+                    result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+
+                processed_results.append(result_dict)
+            except Exception as e:
+                logger.error(f"Error processing result: {e}")
+                processed_results.append({
+                    "url": "unknown",
+                    "success": False,
+                    "error_message": str(e)
+                })

-        return {
+        response = {
             "success": True,
             "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb
         }
+
+        # Add hooks information if hooks were used
+        if hooks_config and hook_manager:
+            from hook_manager import UserHookManager
+            if isinstance(hook_manager, UserHookManager):
+                try:
+                    # Ensure all hook data is JSON serializable
+                    import json
+                    hook_data = {
+                        "status": hooks_status,
+                        "execution_log": hook_manager.execution_log,
+                        "errors": hook_manager.errors,
+                        "summary": hook_manager.get_summary()
+                    }
+                    # Test that it's serializable
+                    json.dumps(hook_data)
+                    response["hooks"] = hook_data
+                except (TypeError, ValueError) as e:
+                    logger.error(f"Hook data not JSON serializable: {e}")
+                    response["hooks"] = {
+                        "status": {"status": "error", "message": "Hook data serialization failed"},
+                        "execution_log": [],
+                        "errors": [{"error": str(e)}],
+                        "summary": {}
+                    }
+
+        return response

     except Exception as e:
         logger.error(f"Crawl error: {str(e)}", exc_info=True)
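
Based on the return statement above, a successful non-streaming response now carries an optional "hooks" block next to the existing fields. A sketch of what a client might receive; the values are illustrative, only the key names come from the code:

# Illustrative response shape; values are made up, keys mirror the handler above.
example_response = {
    "success": True,
    "results": [{"url": "https://example.com", "success": True}],
    "server_processing_time_s": 1.42,
    "server_memory_delta_mb": 12.0,
    "server_peak_memory_mb": 310.5,
    "hooks": {  # present only when hooks_config was supplied and attachment succeeded
        "status": {"status": "attached"},  # exact status values depend on hook_manager
        "execution_log": [],
        "errors": [],
        "summary": {},
    },
}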
@@ -542,9 +611,11 @@ async def handle_stream_crawl_request(
     urls: List[str],
     browser_config: dict,
     crawler_config: dict,
-    config: dict
-) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
-    """Handle streaming crawl requests."""
+    config: dict,
+    hooks_config: Optional[dict] = None
+) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
+    """Handle streaming crawl requests with optional hooks."""
+    hooks_info = None
     try:
         browser_config = BrowserConfig.load(browser_config)
         # browser_config.verbose = True # Set to False or remove for production stress testing
@@ -565,14 +636,28 @@ async def handle_stream_crawl_request(

         # crawler = AsyncWebCrawler(config=browser_config)
         # await crawler.start()
+
+        # Attach hooks if provided
+        if hooks_config:
+            from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+            hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+            hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+                crawler,
+                hooks_config.get('code', {}),
+                timeout=hooks_config.get('timeout', 30),
+                hook_manager=hook_manager
+            )
+            logger.info(f"Hooks attachment status for streaming: {hooks_status['status']}")
+            # Include hook manager in hooks_info for proper tracking
+            hooks_info = {'status': hooks_status, 'manager': hook_manager}

         results_gen = await crawler.arun_many(
             urls=urls,
             config=crawler_config,
             dispatcher=dispatcher
         )

-        return crawler, results_gen
+        return crawler, results_gen, hooks_info

     except Exception as e:
         # Make sure to close crawler if started during an error here
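
Callers of the streaming handler have to unpack the third element introduced here. A minimal sketch of such a caller, assuming it already holds the loaded server config as server_config and consumes the generator itself:

async def stream_with_hooks(server_config: dict):
    # hooks_info is None unless hooks_config was supplied.
    crawler, results_gen, hooks_info = await handle_stream_crawl_request(
        urls=["https://example.com"],
        browser_config={},
        crawler_config={},
        config=server_config,  # assumed: the same loaded config dict used by the other handlers
        hooks_config={"code": {}, "timeout": 30},
    )
    async for result in results_gen:
        ...  # stream each result back to the client
    if hooks_info:
        manager = hooks_info["manager"]  # UserHookManager with execution_log, errors, get_summary()
        logger.info(manager.get_summary())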