
Commit 4e1c4bd

Merge pull request unclecode#1436 from unclecode/fix/docker-filter
fix(docker): resolve filter serialization and JSON encoding errors in deep crawl strategy
2 parents cce3390 + 38f3ea4 commit 4e1c4bd

6 files changed, +243 -9 lines changed

crawl4ai/async_configs.py

Lines changed: 10 additions & 7 deletions
@@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
                 if value != param.default and not ignore_default_value:
                     current_values[name] = to_serializable_dict(value)
 
-        if hasattr(obj, '__slots__'):
-            for slot in obj.__slots__:
-                if slot.startswith('_'):  # Handle private slots
-                    attr_name = slot[1:]  # Remove leading '_'
-                    value = getattr(obj, slot, None)
-                    if value is not None:
-                        current_values[attr_name] = to_serializable_dict(value)
+        # Don't serialize private __slots__ - they're internal implementation details
+        # not constructor parameters. This was causing URLPatternFilter to fail
+        # because _simple_suffixes was being serialized as 'simple_suffixes'
+        # if hasattr(obj, '__slots__'):
+        #     for slot in obj.__slots__:
+        #         if slot.startswith('_'):  # Handle private slots
+        #             attr_name = slot[1:]  # Remove leading '_'
+        #             value = getattr(obj, slot, None)
+        #             if value is not None:
+        #                 current_values[attr_name] = to_serializable_dict(value)
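For context on the serializer change above, the standalone sketch below (a hypothetical class, not crawl4ai code) shows why emitting private __slots__ with the underscore stripped breaks a serialize/rebuild round trip, and why restricting the output to constructor parameters fixes it:

import fnmatch

# Hypothetical stand-in for a filter that keeps derived state in a private slot.
class PatternFilterLike:
    __slots__ = ("patterns", "reverse", "_compiled")

    def __init__(self, patterns, reverse=False):
        self.patterns = list(patterns)
        self.reverse = reverse
        # Derived state, not a constructor parameter - must not be serialized.
        self._compiled = [fnmatch.translate(p) for p in self.patterns]

# Old behaviour (conceptually): '_compiled' was emitted as 'compiled', so rebuilding
# the object from the params dict passed an unknown keyword to __init__.
bad_params = {"patterns": ["*advanced*"], "reverse": True, "compiled": ["..."]}
try:
    PatternFilterLike(**bad_params)
except TypeError as e:
    print("old behaviour:", e)  # unexpected keyword argument 'compiled'

# New behaviour: only constructor parameters are emitted, so the round trip succeeds.
restored = PatternFilterLike(**{"patterns": ["*advanced*"], "reverse": True})
print("new behaviour:", restored.patterns, restored.reverse)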

crawl4ai/deep_crawling/bff_strategy.py

Lines changed: 7 additions & 1 deletion
@@ -47,7 +47,13 @@ def __init__(
         self.url_scorer = url_scorer
         self.include_external = include_external
         self.max_pages = max_pages
-        self.logger = logger or logging.getLogger(__name__)
+        # self.logger = logger or logging.getLogger(__name__)
+        # Ensure logger is always a Logger instance, not a dict from serialization
+        if isinstance(logger, logging.Logger):
+            self.logger = logger
+        else:
+            # Create a new logger if logger is None, dict, or any other non-Logger type
+            self.logger = logging.getLogger(__name__)
         self.stats = TraversalStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
         self._pages_crawled = 0

crawl4ai/deep_crawling/bfs_strategy.py

Lines changed: 7 additions & 1 deletion
@@ -38,7 +38,13 @@ def __init__(
         self.include_external = include_external
         self.score_threshold = score_threshold
         self.max_pages = max_pages
-        self.logger = logger or logging.getLogger(__name__)
+        # self.logger = logger or logging.getLogger(__name__)
+        # Ensure logger is always a Logger instance, not a dict from serialization
+        if isinstance(logger, logging.Logger):
+            self.logger = logger
+        else:
+            # Create a new logger if logger is None, dict, or any other non-Logger type
+            self.logger = logging.getLogger(__name__)
         self.stats = TraversalStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
         self._pages_crawled = 0
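The identical guard appears in both bff_strategy.py and bfs_strategy.py because the old expression logger or logging.getLogger(__name__) cannot reject a wrong type: any truthy value, including a dict left over from config deserialization, passes straight through and later fails on .info()/.debug() calls. A minimal standalone sketch of the failure mode (the dict below is an illustrative shape, not crawl4ai's actual serialized logger):

import logging

def old_pick(logger):
    # A truthy but wrong-typed value (e.g. a dict) slips through unchanged.
    return logger or logging.getLogger(__name__)

def new_pick(logger):
    # Mirrors the patch: fall back unless we really have a logging.Logger.
    return logger if isinstance(logger, logging.Logger) else logging.getLogger(__name__)

maybe_serialized = {"name": "crawl4ai", "level": 20}  # illustrative only
try:
    old_pick(maybe_serialized).info("boom")
except AttributeError as e:
    print("old behaviour:", e)  # 'dict' object has no attribute 'info'

new_pick(maybe_serialized).info("new behaviour: always a real Logger")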

crawl4ai/deep_crawling/filters.py

Lines changed: 8 additions & 0 deletions
@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
     """Pattern filter balancing speed and completeness"""
 
     __slots__ = (
+        "patterns",  # Store original patterns for serialization
+        "use_glob",  # Store original use_glob for serialization
+        "reverse",  # Store original reverse for serialization
         "_simple_suffixes",
         "_simple_prefixes",
         "_domain_patterns",
@@ -142,6 +145,11 @@ def __init__(
         reverse: bool = False,
     ):
         super().__init__()
+        # Store original constructor params for serialization
+        self.patterns = patterns
+        self.use_glob = use_glob
+        self.reverse = reverse
+
         self._reverse = reverse
         patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
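With patterns, use_glob, and reverse now stored as public attributes that mirror the constructor signature, a URLPatternFilter should survive the config round trip. A rough usage sketch, assuming to_serializable_dict is importable from crawl4ai.async_configs as shown in the diff above and that its output shape roughly matches the REST payloads used later in this commit (both are assumptions, not verified API guarantees):

from crawl4ai.async_configs import to_serializable_dict
from crawl4ai.deep_crawling import URLPatternFilter

f = URLPatternFilter(patterns=["*advanced*"], reverse=True)
payload = to_serializable_dict(f)
print(payload)  # expected to carry only constructor params such as patterns/reverse

# Because the emitted keys now mirror __init__, rebuilding the filter is a plain unpack.
params = payload.get("params", payload)  # tolerate a flat or {"type": ..., "params": ...} shape
restored = URLPatternFilter(**{k: params[k] for k in ("patterns", "use_glob", "reverse") if k in params})
print(restored.patterns, restored.reverse)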

crawl4ai/models.py

Lines changed: 10 additions & 0 deletions
@@ -253,6 +253,16 @@ def model_dump(self, *args, **kwargs):
         requirements change, this is where you would update the logic.
         """
         result = super().model_dump(*args, **kwargs)
+
+        # Remove any property descriptors that might have been included
+        # These deprecated properties should not be in the serialized output
+        for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
+            if key in result and isinstance(result[key], property):
+                # del result[key]
+                # Nasrin: I decided to convert it to string instead of removing it.
+                result[key] = str(result[key])
+
+        # Add the markdown field properly
         if self._markdown is not None:
             result["markdown"] = self._markdown.model_dump()
         return result
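The root cause on the encoding side is that a property object is not JSON serializable, so a descriptor leaking into the dump breaks the HTTP response. A standalone illustration of why coercing to str() unblocks json.dumps; the class below is hypothetical and only stands in for the deprecated accessors named in the loop above:

import json

class LegacyResult:
    @property
    def markdown_v2(self):  # stand-in for a deprecated accessor
        return "deprecated"

# Simulate a dump that accidentally picked up the descriptor object itself.
dump = {"url": "https://docs.crawl4ai.com", "markdown_v2": LegacyResult.__dict__["markdown_v2"]}

try:
    json.dumps(dump)
except TypeError as e:
    print("before the fix:", e)  # Object of type property is not JSON serializable

# The patch coerces the descriptor to a string instead of deleting the key.
dump["markdown_v2"] = str(dump["markdown_v2"])
print("after the fix:", json.dumps(dump))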
New test file — 201 additions & 0 deletions

@@ -0,0 +1,201 @@
"""
Test the complete fix for both the filter serialization and JSON serialization issues.
"""

import asyncio
import httpx

from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter

BASE_URL = "http://localhost:11234/"  # Adjust port as needed

async def test_with_docker_client():
    """Test using the Docker client (same as 1419.py)."""
    from crawl4ai.docker_client import Crawl4aiDockerClient

    print("=" * 60)
    print("Testing with Docker Client")
    print("=" * 60)

    try:
        async with Crawl4aiDockerClient(
            base_url=BASE_URL,
            verbose=True,
        ) as client:

            # Create filter chain - testing the serialization fix
            filter_chain = [
                URLPatternFilter(
                    # patterns=["*about*", "*privacy*", "*terms*"],
                    patterns=["*advanced*"],
                    reverse=True
                ),
            ]

            crawler_config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=2,  # Keep it shallow for testing
                    # max_pages=5,  # Limit pages for testing
                    filter_chain=FilterChain(filter_chain)
                ),
                cache_mode=CacheMode.BYPASS,
            )

            print("\n1. Testing crawl with filters...")
            results = await client.crawl(
                ["https://docs.crawl4ai.com"],  # Simple test page
                browser_config=BrowserConfig(headless=True),
                crawler_config=crawler_config,
            )

            if results:
                print(f"✅ Crawl succeeded! Type: {type(results)}")
                if hasattr(results, 'success'):
                    print(f"✅ Results success: {results.success}")
                    # Test that we can iterate results without JSON errors
                    if hasattr(results, '__iter__'):
                        for i, result in enumerate(results):
                            if hasattr(result, 'url'):
                                print(f" Result {i}: {result.url[:50]}...")
                            else:
                                print(f" Result {i}: {str(result)[:50]}...")
                else:
                    # Handle list of results
                    print(f"✅ Got {len(results)} results")
                    for i, result in enumerate(results[:3]):  # Show first 3
                        print(f" Result {i}: {result.url[:50]}...")
            else:
                print("❌ Crawl failed - no results returned")
                return False

            print("\n✅ Docker client test completed successfully!")
            return True

    except Exception as e:
        print(f"❌ Docker client test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def test_with_rest_api():
    """Test using REST API directly."""
    print("\n" + "=" * 60)
    print("Testing with REST API")
    print("=" * 60)

    # Create filter configuration
    deep_crawl_strategy_payload = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": 2,
            # "max_pages": 5,
            "filter_chain": {
                "type": "FilterChain",
                "params": {
                    "filters": [
                        {
                            "type": "URLPatternFilter",
                            "params": {
                                "patterns": ["*advanced*"],
                                "reverse": True
                            }
                        }
                    ]
                }
            }
        }
    }

    crawl_payload = {
        "urls": ["https://docs.crawl4ai.com"],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "deep_crawl_strategy": deep_crawl_strategy_payload,
                "cache_mode": "bypass"
            }
        }
    }

    try:
        async with httpx.AsyncClient() as client:
            print("\n1. Sending crawl request to REST API...")
            response = await client.post(
                f"{BASE_URL}crawl",
                json=crawl_payload,
                timeout=30
            )

            if response.status_code == 200:
                print(f"✅ REST API returned 200 OK")
                data = response.json()
                if data.get("success"):
                    results = data.get("results", [])
                    print(f"✅ Got {len(results)} results")
                    for i, result in enumerate(results[:3]):
                        print(f" Result {i}: {result.get('url', 'unknown')[:50]}...")
                else:
                    print(f"❌ Crawl not successful: {data}")
                    return False
            else:
                print(f"❌ REST API returned {response.status_code}")
                print(f" Response: {response.text[:500]}")
                return False

            print("\n✅ REST API test completed successfully!")
            return True

    except Exception as e:
        print(f"❌ REST API test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run all tests."""
    print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
    print("=" * 60)
    print("Make sure the server is running with the updated code!")
    print("=" * 60)

    results = []

    # Test 1: Docker client
    docker_passed = await test_with_docker_client()
    results.append(("Docker Client", docker_passed))

    # Test 2: REST API
    rest_passed = await test_with_rest_api()
    results.append(("REST API", rest_passed))

    # Summary
    print("\n" + "=" * 60)
    print("FINAL TEST SUMMARY")
    print("=" * 60)

    all_passed = True
    for test_name, passed in results:
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"{test_name:20} {status}")
        if not passed:
            all_passed = False

    print("=" * 60)
    if all_passed:
        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
        print("\nThe fixes:")
        print("1. Filter serialization: Fixed by not serializing private __slots__")
        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
    else:
        print("⚠️ Some tests failed. Please check the server logs for details.")

    return 0 if all_passed else 1


if __name__ == "__main__":
    import sys
    sys.exit(asyncio.run(main()))
