#!/usr/bin/env python3
"""
Simple test to verify BestFirstCrawlingStrategy fixes.
This test crawls a real website and shows that:
1. Higher-scoring pages are crawled first (priority queue fix)
2. Links are scored before truncation (link discovery fix)

Illustrative sketches of both behaviors follow the imports below.
"""

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

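# --- Illustrative sketches (not executed by the test) ---------------------
# Minimal, self-contained approximations of the two fixed behaviors named in
# the docstring. They are sketches only: the function names and data shapes
# below are made up and are not BestFirstCrawlingStrategy's real internals.
import heapq

def best_first_order_sketch(scored_links):
    """Fix 1: visit URLs highest-score-first via a priority queue.

    heapq is a min-heap, so scores are negated on push to pop max-first.
    """
    heap = [(-score, url) for url, score in scored_links]
    heapq.heapify(heap)
    while heap:
        neg_score, url = heapq.heappop(heap)
        yield -neg_score, url

def score_then_truncate_sketch(urls, scorer, limit):
    """Fix 2: score ALL discovered links, then keep only the top `limit`.

    Truncating before scoring could throw away the best candidates.
    """
    scored = sorted(((scorer(url), url) for url in urls), reverse=True)
    return scored[:limit]
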
async def test_best_first_strategy():
    """Test BestFirstCrawlingStrategy with keyword scoring."""

    print("=" * 70)
    print("Testing BestFirstCrawlingStrategy with Real URL")
    print("=" * 70)
    print("\nThis test will:")
    print("1. Crawl Python.org documentation")
    print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
    print("3. Show that higher-scoring pages are crawled first")
    print("-" * 70)

    # Create a keyword scorer that prioritizes tutorial/guide pages
    scorer = KeywordRelevanceScorer(
        keywords=["tutorial", "guide", "reference", "documentation"],
        weight=1.0,
        case_sensitive=False
    )
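    # Assumption: KeywordRelevanceScorer yields scores in roughly [0, 1]
    # (keyword matches scaled by `weight`); the 0.5 and 0.3 thresholds used
    # further down rely on that range.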

    # Create the strategy with scoring
    strategy = BestFirstCrawlingStrategy(
        max_depth=2,             # Crawl 2 levels deep
        max_pages=10,            # Limit to 10 pages total
        url_scorer=scorer,       # Use keyword scoring
        include_external=False   # Only internal links
    )

    # Configure browser and crawler
    browser_config = BrowserConfig(
        headless=True,   # Run in background
        verbose=False    # Reduce output noise
    )

    crawler_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        verbose=False
    )
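    # With deep_crawl_strategy set (and streaming off), arun() is expected
    # to return a list with one result per crawled page, not a single result.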

    print("\nStarting crawl of https://docs.python.org/3/")
    print("Looking for pages with keywords: tutorial, guide, reference, documentation")
    print("-" * 70)

    crawled_urls = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl and collect results
        results = await crawler.arun(
            url="https://docs.python.org/3/",
            config=crawler_config
        )

        # Normalize to a list so a single-result return is still processed
        if not isinstance(results, list):
            results = [results]

        # The deep crawl strategy records each page's score and depth
        # in result.metadata
        for result in results:
            score = result.metadata.get('score', 0) if result.metadata else 0
            depth = result.metadata.get('depth', 0) if result.metadata else 0
            crawled_urls.append({
                'url': result.url,
                'score': score,
                'depth': depth,
                'success': result.success
            })

    print("\n" + "=" * 70)
    print("CRAWL RESULTS (in order of crawling)")
    print("=" * 70)

    for i, item in enumerate(crawled_urls, 1):
        status = "✓" if item['success'] else "✗"
        print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
        # Highlight high-scoring pages
        if item['score'] > 0.5:
            print("       ^ HIGH SCORE - Contains keywords!")

    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)

    # Check if higher scores appear early in the crawl
    scores = [item['score'] for item in crawled_urls[1:]]  # Skip initial URL
    high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]

    # If the first high-scoring page falls in the first half of the crawl
    # order, the priority queue visited it ahead of low-value pages
    if high_score_indices and high_score_indices[0] < len(scores) / 2:
        print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
        print("   This confirms the priority queue fix is working.")
    else:
        print("⚠️ Check the crawl order above - higher scores should appear early")

    # Show score distribution (guard against an empty crawl so the average
    # and max computations don't raise)
    if crawled_urls:
        print("\nScore Statistics:")
        print(f"  - Total pages crawled: {len(crawled_urls)}")
        print(f"  - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
        print(f"  - Max score: {max(item['score'] for item in crawled_urls):.2f}")
        print(f"  - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")

    print("\n" + "=" * 70)
    print("TEST COMPLETE")
    print("=" * 70)

if __name__ == "__main__":
    print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
    asyncio.run(test_best_first_strategy())