
Commit fef715a
Merge branch 'feature/docker-hooks' into develop
2 parents: 69e8ca3 + be63c98

File tree: 8 files changed, +2561 −26 lines

deploy/docker/api.py

Lines changed: 100 additions & 15 deletions
@@ -442,13 +442,15 @@ async def handle_crawl_request(
     urls: List[str],
     browser_config: dict,
     crawler_config: dict,
-    config: dict
+    config: dict,
+    hooks_config: Optional[dict] = None
 ) -> dict:
-    """Handle non-streaming crawl requests."""
+    """Handle non-streaming crawl requests with optional hooks."""
     start_mem_mb = _get_memory_mb() # <--- Get memory before
     start_time = time.time()
     mem_delta_mb = None
     peak_mem_mb = start_mem_mb
+    hook_manager = None
 
     try:
         urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
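
The new hooks_config parameter is only read via hooks_config.get('code', {}) and hooks_config.get('timeout', 30) further down in this diff, which suggests a payload of user hook code keyed by hook point, plus an optional timeout. A minimal sketch of such a payload, assuming a hook-point name and user snippet that are purely illustrative and not taken from this commit:

# Hypothetical hooks_config payload for handle_crawl_request.
# Only the 'code' and 'timeout' keys are grounded in the diff; the hook-point
# name and the user snippet below are illustrative assumptions.
hooks_config = {
    "code": {
        "on_page_context_created": (  # hypothetical hook point
            "async def hook(page, context, **kwargs):\n"
            "    await page.set_viewport_size({'width': 1280, 'height': 800})\n"
            "    return page\n"
        ),
    },
    "timeout": 30,  # per-hook timeout in seconds; matches the handler's default
}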
@@ -468,6 +470,19 @@ async def handle_crawl_request(
         # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
         # await crawler.start()
 
+        # Attach hooks if provided
+        hooks_status = {}
+        if hooks_config:
+            from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+            hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+            hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+                crawler,
+                hooks_config.get('code', {}),
+                timeout=hooks_config.get('timeout', 30),
+                hook_manager=hook_manager
+            )
+            logger.info(f"Hooks attachment status: {hooks_status['status']}")
+
         base_config = config["crawler"]["base_config"]
         # Iterate on key-value pairs in global_config then use hasattr to set them
         for key, value in base_config.items():
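
From the way attach_user_hooks_to_crawler is called here, it must accept the crawler, a dict of hook code, a timeout, and an existing UserHookManager, and return a (hooks_status, hook_manager) pair whose status dict carries at least a 'status' key. A minimal sketch of that contract, using only the names and fields as this handler uses them; this is a hypothetical stand-in, not the hook_manager.py added by the commit:

from typing import Optional, Tuple


class UserHookManager:
    """Hypothetical stand-in that records what the handler later reads back."""

    def __init__(self, timeout: int = 30):
        self.timeout = timeout
        self.execution_log: list = []   # read when building the response
        self.errors: list = []          # read when building the response

    def get_summary(self) -> dict:
        return {"executed": len(self.execution_log), "errors": len(self.errors)}


async def attach_user_hooks_to_crawler(
    crawler,
    hook_code: dict,
    timeout: int = 30,
    hook_manager: Optional[UserHookManager] = None,
) -> Tuple[dict, UserHookManager]:
    # The caller only requires a status dict with a 'status' key and the manager back.
    hook_manager = hook_manager or UserHookManager(timeout=timeout)
    return {"status": "attached", "hooks": list(hook_code.keys())}, hook_manager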
@@ -484,6 +499,10 @@ async def handle_crawl_request(
             config=crawler_config,
             dispatcher=dispatcher)
         results = await partial_func()
+
+        # Ensure results is always a list
+        if not isinstance(results, list):
+            results = [results]
 
         # await crawler.close()
 
@@ -498,22 +517,72 @@ async def handle_crawl_request(
         # Process results to handle PDF bytes
         processed_results = []
         for result in results:
-            result_dict = result.model_dump()
-            # if fit_html is not a string, set it to None to avoid serialization errors
-            if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
-                result_dict["fit_html"] = None
-            # If PDF exists, encode it to base64
-            if result_dict.get('pdf') is not None:
-                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
-            processed_results.append(result_dict)
+            try:
+                # Check if result has model_dump method (is a proper CrawlResult)
+                if hasattr(result, 'model_dump'):
+                    result_dict = result.model_dump()
+                elif isinstance(result, dict):
+                    result_dict = result
+                else:
+                    # Handle unexpected result type
+                    logger.warning(f"Unexpected result type: {type(result)}")
+                    result_dict = {
+                        "url": str(result) if hasattr(result, '__str__') else "unknown",
+                        "success": False,
+                        "error_message": f"Unexpected result type: {type(result).__name__}"
+                    }
+
+                # if fit_html is not a string, set it to None to avoid serialization errors
+                if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+                    result_dict["fit_html"] = None
+
+                # If PDF exists, encode it to base64
+                if result_dict.get('pdf') is not None and isinstance(result_dict.get('pdf'), bytes):
+                    result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+
+                processed_results.append(result_dict)
+            except Exception as e:
+                logger.error(f"Error processing result: {e}")
+                processed_results.append({
+                    "url": "unknown",
+                    "success": False,
+                    "error_message": str(e)
+                })
 
-        return {
+        response = {
             "success": True,
             "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb
         }
+
+        # Add hooks information if hooks were used
+        if hooks_config and hook_manager:
+            from hook_manager import UserHookManager
+            if isinstance(hook_manager, UserHookManager):
+                try:
+                    # Ensure all hook data is JSON serializable
+                    import json
+                    hook_data = {
+                        "status": hooks_status,
+                        "execution_log": hook_manager.execution_log,
+                        "errors": hook_manager.errors,
+                        "summary": hook_manager.get_summary()
+                    }
+                    # Test that it's serializable
+                    json.dumps(hook_data)
+                    response["hooks"] = hook_data
+                except (TypeError, ValueError) as e:
+                    logger.error(f"Hook data not JSON serializable: {e}")
+                    response["hooks"] = {
+                        "status": {"status": "error", "message": "Hook data serialization failed"},
+                        "execution_log": [],
+                        "errors": [{"error": str(e)}],
+                        "summary": {}
+                    }
+
+        return response
 
     except Exception as e:
         logger.error(f"Crawl error: {str(e)}", exc_info=True)
@@ -542,9 +611,11 @@ async def handle_stream_crawl_request(
     urls: List[str],
     browser_config: dict,
     crawler_config: dict,
-    config: dict
-) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
-    """Handle streaming crawl requests."""
+    config: dict,
+    hooks_config: Optional[dict] = None
+) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
+    """Handle streaming crawl requests with optional hooks."""
+    hooks_info = None
     try:
         browser_config = BrowserConfig.load(browser_config)
         # browser_config.verbose = True # Set to False or remove for production stress testing
@@ -565,14 +636,28 @@ async def handle_stream_crawl_request(
 
         # crawler = AsyncWebCrawler(config=browser_config)
         # await crawler.start()
+
+        # Attach hooks if provided
+        if hooks_config:
+            from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+            hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+            hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+                crawler,
+                hooks_config.get('code', {}),
+                timeout=hooks_config.get('timeout', 30),
+                hook_manager=hook_manager
+            )
+            logger.info(f"Hooks attachment status for streaming: {hooks_status['status']}")
+            # Include hook manager in hooks_info for proper tracking
+            hooks_info = {'status': hooks_status, 'manager': hook_manager}
 
         results_gen = await crawler.arun_many(
             urls=urls,
             config=crawler_config,
             dispatcher=dispatcher
         )
 
-        return crawler, results_gen
+        return crawler, results_gen, hooks_info
 
     except Exception as e:
         # Make sure to close crawler if started during an error here
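
Callers of handle_stream_crawl_request now receive a three-tuple instead of two values. A sketch of how a streaming caller might unpack it; the wrapper function and the final hooks yield are illustrative assumptions, only handle_stream_crawl_request, crawler.close(), and the hooks_info shape come from the diff above:

# Hypothetical caller-side sketch, not this commit's endpoint code.
async def stream_results(urls, browser_config, crawler_config, config, hooks_config=None):
    crawler, results_gen, hooks_info = await handle_stream_crawl_request(
        urls=urls,
        browser_config=browser_config,
        crawler_config=crawler_config,
        config=config,
        hooks_config=hooks_config,
    )
    try:
        async for result in results_gen:
            yield result  # stream each result to the client as it completes
        if hooks_info:
            # hooks_info carries {'status': ..., 'manager': ...} per the diff above
            yield {"hooks": hooks_info["status"]}
    finally:
        await crawler.close()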
