
Commit 102352e

fix(docker): resolve filter serialization and JSON encoding errors in deep crawl strategy (ref unclecode#1419)
- Fix URLPatternFilter serialization by preventing private __slots__ from being serialized as constructor params
- Add public attributes to URLPatternFilter to store the original constructor parameters for proper serialization
- Handle property descriptors in CrawlResult.model_dump() to prevent JSON serialization errors
- Ensure filter chains work correctly with the Docker client and REST API

The issue occurred because:
1. Private implementation details (_simple_suffixes, etc.) were being serialized and passed as constructor arguments during deserialization
2. Property descriptors were being included in the serialized output, causing "Object of type property is not JSON serializable" errors

Changes:
- async_configs.py: Comment out the __slots__ serialization logic (lines 100-109)
- filters.py: Add patterns, use_glob, and reverse to URLPatternFilter __slots__ and store them as public attributes
- models.py: Convert property descriptors to strings in model_dump() instead of including them directly
1 parent ef174a4 · commit 102352e
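
For context, a minimal sketch of the first failure mode, using a hypothetical stand-in class rather than crawl4ai's real URLPatternFilter: walking private __slots__ produces keys the constructor never accepted, so deserialization fails with an unexpected-keyword error.

# Stand-in class for illustration only - not the library's actual code.
class SlottedFilter:
    __slots__ = ("_simple_suffixes",)

    def __init__(self, patterns):
        self._simple_suffixes = tuple(patterns)  # derived, private state

def naive_serialize(obj):
    # Old behavior: strip the leading '_' and treat each slot as a constructor param.
    return {slot[1:]: getattr(obj, slot) for slot in obj.__slots__}

params = naive_serialize(SlottedFilter(["*advanced*"]))
# SlottedFilter(**params)
# -> TypeError: __init__() got an unexpected keyword argument 'simple_suffixes'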

File tree

4 files changed: +229 -7 lines changed

crawl4ai/async_configs.py

Lines changed: 10 additions & 7 deletions

@@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
         if value != param.default and not ignore_default_value:
             current_values[name] = to_serializable_dict(value)

-    if hasattr(obj, '__slots__'):
-        for slot in obj.__slots__:
-            if slot.startswith('_'): # Handle private slots
-                attr_name = slot[1:] # Remove leading '_'
-                value = getattr(obj, slot, None)
-                if value is not None:
-                    current_values[attr_name] = to_serializable_dict(value)
+    # Don't serialize private __slots__ - they're internal implementation details
+    # not constructor parameters. This was causing URLPatternFilter to fail
+    # because _simple_suffixes was being serialized as 'simple_suffixes'
+    # if hasattr(obj, '__slots__'):
+    #     for slot in obj.__slots__:
+    #         if slot.startswith('_'): # Handle private slots
+    #             attr_name = slot[1:] # Remove leading '_'
+    #             value = getattr(obj, slot, None)
+    #             if value is not None:
+    #                 current_values[attr_name] = to_serializable_dict(value)
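
With the private-slot walk removed, only real constructor parameters should appear in the serialized output. A rough sketch of the intended round trip (the exact output shape is an assumption here, mirrored from the {"type": ..., "params": {...}} REST payload used in the test script below):

from crawl4ai.async_configs import to_serializable_dict
from crawl4ai.deep_crawling import URLPatternFilter

f = URLPatternFilter(patterns=["*advanced*"], reverse=True)
print(to_serializable_dict(f))
# Expected, roughly: {'type': 'URLPatternFilter', 'params': {'patterns': ['*advanced*'], 'reverse': True}}
# and, crucially, no 'simple_suffixes' key derived from the private _simple_suffixes slot.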

crawl4ai/deep_crawling/filters.py

Lines changed: 8 additions & 0 deletions

@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
     """Pattern filter balancing speed and completeness"""

     __slots__ = (
+        "patterns", # Store original patterns for serialization
+        "use_glob", # Store original use_glob for serialization
+        "reverse", # Store original reverse for serialization
         "_simple_suffixes",
         "_simple_prefixes",
         "_domain_patterns",
@@ -142,6 +145,11 @@ def __init__(
         reverse: bool = False,
     ):
         super().__init__()
+        # Store original constructor params for serialization
+        self.patterns = patterns
+        self.use_glob = use_glob
+        self.reverse = reverse
+
         self._reverse = reverse
         patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
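
Because patterns, use_glob, and reverse are now kept as public attributes, the filter can be rebuilt from the same values it was constructed with, which is the round trip the serializer relies on. A minimal sketch:

from crawl4ai.deep_crawling import URLPatternFilter

f = URLPatternFilter(patterns=["*advanced*"], reverse=True)
params = {"patterns": f.patterns, "use_glob": f.use_glob, "reverse": f.reverse}
clone = URLPatternFilter(**params)  # reconstructs without unexpected-keyword errors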

crawl4ai/models.py

Lines changed: 10 additions & 0 deletions

@@ -253,6 +253,16 @@ def model_dump(self, *args, **kwargs):
         requirements change, this is where you would update the logic.
         """
         result = super().model_dump(*args, **kwargs)
+
+        # Remove any property descriptors that might have been included
+        # These deprecated properties should not be in the serialized output
+        for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
+            if key in result and isinstance(result[key], property):
+                # del result[key]
+                # Nasrin: I decided to convert it to string instead of removing it.
+                result[key] = str(result[key])
+
+        # Add the markdown field properly
         if self._markdown is not None:
             result["markdown"] = self._markdown.model_dump()
         return result
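
The guard matters because a bare property object cannot pass through the JSON encoder, while str() turns it into a harmless placeholder. A small standalone illustration, independent of CrawlResult:

import json

class Legacy:
    @property
    def markdown_v2(self):  # deprecated-style accessor
        return "deprecated"

descriptor = Legacy.__dict__["markdown_v2"]  # the class-level property object

try:
    json.dumps({"markdown_v2": descriptor})
except TypeError as e:
    print(e)  # Object of type property is not JSON serializable

print(str(descriptor))  # e.g. "<property object at 0x...>" - JSON-encodable as a plain string
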
Lines changed: 201 additions & 0 deletions

@@ -0,0 +1,201 @@
+"""
+Test the complete fix for both the filter serialization and JSON serialization issues.
+"""
+
+import asyncio
+import httpx
+
+from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
+
+BASE_URL = "http://localhost:11234/"  # Adjust port as needed
+
+async def test_with_docker_client():
+    """Test using the Docker client (same as 1419.py)."""
+    from crawl4ai.docker_client import Crawl4aiDockerClient
+
+    print("=" * 60)
+    print("Testing with Docker Client")
+    print("=" * 60)
+
+    try:
+        async with Crawl4aiDockerClient(
+            base_url=BASE_URL,
+            verbose=True,
+        ) as client:
+
+            # Create filter chain - testing the serialization fix
+            filter_chain = [
+                URLPatternFilter(
+                    # patterns=["*about*", "*privacy*", "*terms*"],
+                    patterns=["*advanced*"],
+                    reverse=True
+                ),
+            ]
+
+            crawler_config = CrawlerRunConfig(
+                deep_crawl_strategy=BFSDeepCrawlStrategy(
+                    max_depth=2,  # Keep it shallow for testing
+                    # max_pages=5,  # Limit pages for testing
+                    filter_chain=FilterChain(filter_chain)
+                ),
+                cache_mode=CacheMode.BYPASS,
+            )
+
+            print("\n1. Testing crawl with filters...")
+            results = await client.crawl(
+                ["https://docs.crawl4ai.com"],  # Simple test page
+                browser_config=BrowserConfig(headless=True),
+                crawler_config=crawler_config,
+            )
+
+            if results:
+                print(f"✅ Crawl succeeded! Type: {type(results)}")
+                if hasattr(results, 'success'):
+                    print(f"✅ Results success: {results.success}")
+                    # Test that we can iterate results without JSON errors
+                    if hasattr(results, '__iter__'):
+                        for i, result in enumerate(results):
+                            if hasattr(result, 'url'):
+                                print(f"  Result {i}: {result.url[:50]}...")
+                            else:
+                                print(f"  Result {i}: {str(result)[:50]}...")
+                else:
+                    # Handle list of results
+                    print(f"✅ Got {len(results)} results")
+                    for i, result in enumerate(results[:3]):  # Show first 3
+                        print(f"  Result {i}: {result.url[:50]}...")
+            else:
+                print("❌ Crawl failed - no results returned")
+                return False
+
+            print("\n✅ Docker client test completed successfully!")
+            return True
+
+    except Exception as e:
+        print(f"❌ Docker client test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+async def test_with_rest_api():
+    """Test using REST API directly."""
+    print("\n" + "=" * 60)
+    print("Testing with REST API")
+    print("=" * 60)
+
+    # Create filter configuration
+    deep_crawl_strategy_payload = {
+        "type": "BFSDeepCrawlStrategy",
+        "params": {
+            "max_depth": 2,
+            # "max_pages": 5,
+            "filter_chain": {
+                "type": "FilterChain",
+                "params": {
+                    "filters": [
+                        {
+                            "type": "URLPatternFilter",
+                            "params": {
+                                "patterns": ["*advanced*"],
+                                "reverse": True
+                            }
+                        }
+                    ]
+                }
+            }
+        }
+    }
+
+    crawl_payload = {
+        "urls": ["https://docs.crawl4ai.com"],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "deep_crawl_strategy": deep_crawl_strategy_payload,
+                "cache_mode": "bypass"
+            }
+        }
+    }
+
+    try:
+        async with httpx.AsyncClient() as client:
+            print("\n1. Sending crawl request to REST API...")
+            response = await client.post(
+                f"{BASE_URL}crawl",
+                json=crawl_payload,
+                timeout=30
+            )
+
+            if response.status_code == 200:
+                print(f"✅ REST API returned 200 OK")
+                data = response.json()
+                if data.get("success"):
+                    results = data.get("results", [])
+                    print(f"✅ Got {len(results)} results")
+                    for i, result in enumerate(results[:3]):
+                        print(f"  Result {i}: {result.get('url', 'unknown')[:50]}...")
+                else:
+                    print(f"❌ Crawl not successful: {data}")
+                    return False
+            else:
+                print(f"❌ REST API returned {response.status_code}")
+                print(f"  Response: {response.text[:500]}")
+                return False
+
+            print("\n✅ REST API test completed successfully!")
+            return True
+
+    except Exception as e:
+        print(f"❌ REST API test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+async def main():
+    """Run all tests."""
+    print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
+    print("=" * 60)
+    print("Make sure the server is running with the updated code!")
+    print("=" * 60)
+
+    results = []
+
+    # Test 1: Docker client
+    docker_passed = await test_with_docker_client()
+    results.append(("Docker Client", docker_passed))
+
+    # Test 2: REST API
+    rest_passed = await test_with_rest_api()
+    results.append(("REST API", rest_passed))
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("FINAL TEST SUMMARY")
+    print("=" * 60)
+
+    all_passed = True
+    for test_name, passed in results:
+        status = "✅ PASSED" if passed else "❌ FAILED"
+        print(f"{test_name:20} {status}")
+        if not passed:
+            all_passed = False
+
+    print("=" * 60)
+    if all_passed:
+        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
+        print("\nThe fixes:")
+        print("1. Filter serialization: Fixed by not serializing private __slots__")
+        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
+    else:
+        print("⚠️ Some tests failed. Please check the server logs for details.")
+
+    return 0 if all_passed else 1
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(asyncio.run(main()))
