
Commit 23431d8

Merge pull request unclecode#1389 from unclecode/fix/deep-crawl-scoring
fix(deep-crawl): BestFirst priority inversion
2 parents f8eaf01 + 88a9fbb commit 23431d8
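
For context: asyncio.PriorityQueue is a min-heap, so get() always returns the entry that sorts lowest. Pushing raw relevance scores therefore made BestFirst crawl the worst-scoring URLs first; the fix pushes the negated score so the highest-scoring URL sorts lowest and pops first. A minimal sketch of the two behaviors, with made-up URLs and scores:

    import asyncio

    async def demo():
        scored = [("https://example.com/contact", 0.1),
                  ("https://example.com/tutorial", 0.9)]

        # Inverted (pre-fix): raw scores make the LOW-scoring URL pop first.
        q: asyncio.PriorityQueue = asyncio.PriorityQueue()
        for url, score in scored:
            await q.put((score, url))
        print(await q.get())  # (0.1, 'https://example.com/contact')

        # Fixed: negated scores make the HIGH-scoring URL pop first.
        q = asyncio.PriorityQueue()
        for url, score in scored:
            await q.put((-score, url))
        print(await q.get())  # (-0.9, 'https://example.com/tutorial')

    asyncio.run(demo())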

2 files changed: +121 −8 lines changed

crawl4ai/deep_crawling/bff_strategy.py

Lines changed: 4 additions & 8 deletions
@@ -122,11 +122,6 @@ async def link_discovery(
 
             valid_links.append(base_url)
 
-        # If we have more valid links than capacity, limit them
-        if len(valid_links) > remaining_capacity:
-            valid_links = valid_links[:remaining_capacity]
-            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
-
         # Record the new depths and add to next_links
         for url in valid_links:
             depths[url] = new_depth

@@ -146,7 +141,8 @@ async def _arun_best_first(
         """
         queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
         # Push the initial URL with score 0 and depth 0.
-        await queue.put((0, 0, start_url, None))
+        initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
+        await queue.put((-initial_score, 0, start_url, None))
         visited: Set[str] = set()
         depths: Dict[str, int] = {start_url: 0}

@@ -193,7 +189,7 @@ async def _arun_best_first(
                 result.metadata = result.metadata or {}
                 result.metadata["depth"] = depth
                 result.metadata["parent_url"] = parent_url
-                result.metadata["score"] = score
+                result.metadata["score"] = -score
 
                 # Count only successful crawls toward max_pages limit
                 if result.success:

@@ -214,7 +210,7 @@ async def _arun_best_first(
                     for new_url, new_parent in new_links:
                         new_depth = depths.get(new_url, depth + 1)
                         new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
-                        await queue.put((new_score, new_depth, new_url, new_parent))
+                        await queue.put((-new_score, new_depth, new_url, new_parent))
 
                 # End of crawl.
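
Two things change in bff_strategy.py. Every queue.put() now pushes -score instead of score; because asyncio.PriorityQueue pops the smallest entry, the negation makes the best-scoring URL come out first, and result.metadata["score"] = -score converts the stored priority back to the original positive score for downstream consumers. Separately, link_discovery no longer truncates valid_links to remaining_capacity before scoring: that cut links in arrival order before the scorer ever saw them, whereas the priority queue (with max_pages counted against successful crawls) now decides what actually gets fetched. A hypothetical illustration, with invented paths and scores, of what arrival-order truncation loses:

    # Capacity of 2; links listed in discovery order, with scores a
    # KeywordRelevanceScorer might assign (values invented).
    links = ["/about", "/careers", "/tutorial", "/guide"]
    scores = {"/about": 0.1, "/careers": 0.0, "/tutorial": 0.9, "/guide": 0.8}

    # Truncating before scoring keeps whatever arrived first.
    print(links[:2])  # ['/about', '/careers']

    # Letting the scorer rank first keeps the relevant pages.
    print(sorted(links, key=scores.get, reverse=True)[:2])  # ['/tutorial', '/guide']
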
tests/general/test_bff_scoring.py

Lines changed: 117 additions & 0 deletions
#!/usr/bin/env python3
"""
Simple test to verify BestFirstCrawlingStrategy fixes.
This test crawls a real website and shows that:
1. Higher-scoring pages are crawled first (priority queue fix)
2. Links are scored before truncation (link discovery fix)
"""

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer


async def test_best_first_strategy():
    """Test BestFirstCrawlingStrategy with keyword scoring"""

    print("=" * 70)
    print("Testing BestFirstCrawlingStrategy with Real URL")
    print("=" * 70)
    print("\nThis test will:")
    print("1. Crawl Python.org documentation")
    print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
    print("3. Show that higher-scoring pages are crawled first")
    print("-" * 70)

    # Create a keyword scorer that prioritizes tutorial/guide pages
    scorer = KeywordRelevanceScorer(
        keywords=["tutorial", "guide", "reference", "documentation"],
        weight=1.0,
        case_sensitive=False
    )

    # Create the strategy with scoring
    strategy = BestFirstCrawlingStrategy(
        max_depth=2,            # Crawl 2 levels deep
        max_pages=10,           # Limit to 10 pages total
        url_scorer=scorer,      # Use keyword scoring
        include_external=False  # Only internal links
    )

    # Configure browser and crawler
    browser_config = BrowserConfig(
        headless=True,  # Run in background
        verbose=False   # Reduce output noise
    )

    crawler_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        verbose=False
    )

    print("\nStarting crawl of https://docs.python.org/3/")
    print("Looking for pages with keywords: tutorial, guide, reference, documentation")
    print("-" * 70)

    crawled_urls = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl and collect results
        results = await crawler.arun(
            url="https://docs.python.org/3/",
            config=crawler_config
        )

        # Process results
        if isinstance(results, list):
            for result in results:
                score = result.metadata.get('score', 0) if result.metadata else 0
                depth = result.metadata.get('depth', 0) if result.metadata else 0
                crawled_urls.append({
                    'url': result.url,
                    'score': score,
                    'depth': depth,
                    'success': result.success
                })

    print("\n" + "=" * 70)
    print("CRAWL RESULTS (in order of crawling)")
    print("=" * 70)

    for i, item in enumerate(crawled_urls, 1):
        status = "✓" if item['success'] else "✗"
        # Highlight high-scoring pages
        if item['score'] > 0.5:
            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
            print(f"     ^ HIGH SCORE - Contains keywords!")
        else:
            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")

    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)

    # Check if higher scores appear early in the crawl
    scores = [item['score'] for item in crawled_urls[1:]]  # Skip initial URL
    high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]

    if high_score_indices and high_score_indices[0] < len(scores) / 2:
        print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
        print("   This confirms the priority queue fix is working.")
    else:
        print("⚠️ Check the crawl order above - higher scores should appear early")

    # Show score distribution
    print(f"\nScore Statistics:")
    print(f"  - Total pages crawled: {len(crawled_urls)}")
    print(f"  - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
    print(f"  - Max score: {max(item['score'] for item in crawled_urls):.2f}")
    print(f"  - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")

    print("\n" + "=" * 70)
    print("TEST COMPLETE")
    print("=" * 70)


if __name__ == "__main__":
    print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
    asyncio.run(test_best_first_strategy())
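
The file is a standalone script rather than a pytest case: it has a shebang and a __main__ guard, so it can be run directly with python tests/general/test_bff_scoring.py. Note that it performs a live crawl of https://docs.python.org/3/, so it needs network access and a working browser backend for AsyncWebCrawler.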
