@@ -102,16 +102,16 @@ async def smart_blog_crawler():

     # Step 2: Configure discovery - let's find all blog posts
     config = SeedingConfig(
-        source="sitemap",           # Use the website's sitemap
-        pattern="*/blog/*.html",    # Only blog posts
+        source="sitemap+cc",        # Use the sitemap plus Common Crawl
+        pattern="*/courses/*",      # Only course-related pages
         extract_head=True,          # Get page metadata
         max_urls=100                # Limit for this example
     )

     # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
     urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")

     # Step 4: Filter for Python tutorials (using metadata!)
     tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
             only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True                # Yield results as each page finishes
         )

         # Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())

 **What just happened?**

-1. We discovered all blog URLs from the sitemap
+1. We discovered all blog URLs from the sitemap and Common Crawl
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth
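
Roughly, steps 1-4 above combine into the following minimal sketch. It assumes the `AsyncUrlSeeder`/`AsyncWebCrawler` usage shown in this diff and that `arun_many(..., stream=True)` yields results as each page finishes; the import path and the simple title filter are illustrative, not prescriptive:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, AsyncUrlSeeder, CrawlerRunConfig, SeedingConfig

async def discover_and_crawl(domain: str = "realpython.com"):
    # 1. Discover candidate URLs from sitemap + Common Crawl
    seed_cfg = SeedingConfig(source="sitemap+cc", pattern="*/courses/*",
                             extract_head=True, max_urls=100)
    async with AsyncUrlSeeder() as seeder:
        found = await seeder.urls(domain, seed_cfg)

    # 2. Filter on <head> metadata only - no crawling yet
    targets = [u["url"] for u in found
               if "python" in (u.get("head_data", {}).get("title") or "").lower()]

    # 3. Crawl just the relevant pages, streaming results as they complete
    run_cfg = CrawlerRunConfig(only_text=True, word_count_threshold=300, stream=True)
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(targets, config=run_cfg):
            print("✅" if result.success else "❌", result.url)

# asyncio.run(discover_and_crawl())
```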
@@ -282,8 +283,8 @@ config = SeedingConfig(
     live_check=True,    # Verify each URL is accessible
     concurrency=20      # Check 20 URLs in parallel
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now you can filter by status
 live_urls = [u for u in urls if u["status"] == "valid"]
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
 config = SeedingConfig(
     extract_head=True    # Extract metadata from <head> section
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now each URL has rich metadata
 for url in urls[:3]:
@@ -387,8 +388,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.3
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # URLs are scored based on:
 # 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -429,8 +430,8 @@ config = SeedingConfig(
     extract_head=True,
     live_check=True
 )
-
-urls = await seeder.urls("blog.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("blog.example.com", config)

 # Analyze the results
 for url in urls[:5]:
@@ -488,8 +489,8 @@ config = SeedingConfig(
     scoring_method="bm25",     # Use BM25 algorithm
     score_threshold=0.3        # Minimum relevance score
 )
-
-urls = await seeder.urls("realpython.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("realpython.com", config)

 # Results are automatically sorted by relevance!
 for url in urls[:5]:
@@ -511,8 +512,8 @@ config = SeedingConfig(
     score_threshold=0.5,
     max_urls=20
 )
-
-urls = await seeder.urls("docs.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("docs.example.com", config)

 # The highest scoring URLs will be API docs!
 ```
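
As a small follow-up, one might hand the top-scoring discoveries straight to the crawler. This is a sketch only, assuming the results are already sorted by relevance as noted above and that `arun_many` accepts a plain list of URLs:

```python
# Sketch: crawl the ten best-scoring API-doc URLs from the seeded results.
top_api_docs = [u["url"] for u in urls[:10]]  # list is already relevance-sorted

async with AsyncWebCrawler() as crawler:
    results = await crawler.arun_many(top_api_docs)
    for r in results:
        print(r.url, "ok" if r.success else "failed")
```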
@@ -529,8 +530,8 @@ config = SeedingConfig(
     score_threshold=0.4,
     pattern="*/product/*"    # Combine with pattern matching
 )
-
-urls = await seeder.urls("shop.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("shop.example.com", config)

 # Filter further by price (from metadata)
 affordable = [
@@ -550,8 +551,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.35
 )
-
-urls = await seeder.urls("technews.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("technews.com", config)

 # Filter by date
 from datetime import datetime, timedelta
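
One plausible way the date filter could continue, as a sketch: it assumes each result's `head_data` carries an `article:published_time` style meta value (the key name and format are assumptions, not a guaranteed schema):

```python
from datetime import datetime, timedelta, timezone

cutoff = datetime.now(timezone.utc) - timedelta(days=30)

def published_at(entry):
    # Key name is an assumption about head_data; adjust to the real meta field.
    raw = entry.get("head_data", {}).get("meta", {}).get("article:published_time")
    if not raw:
        return None
    try:
        dt = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
    except ValueError:
        return None

# Keep only articles published within the last 30 days
recent = [u for u in urls if (dt := published_at(u)) and dt >= cutoff]
```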
@@ -591,8 +592,8 @@ for query in queries:
         score_threshold=0.4,
         max_urls=10    # Top 10 per topic
     )
-
-    urls = await seeder.urls("learning-platform.com", config)
+    async with AsyncUrlSeeder() as seeder:
+        urls = await seeder.urls("learning-platform.com", config)
     all_tutorials.extend(urls)

 # Remove duplicates while preserving order
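
A minimal way to do the de-duplication mentioned here, keyed on the URL string (assuming each entry is a dict with a `"url"` field, as in the other examples):

```python
# Sketch: drop duplicate entries while keeping first-seen order.
seen = set()
unique_tutorials = []
for entry in all_tutorials:
    if entry["url"] not in seen:
        seen.add(entry["url"])
        unique_tutorials.append(entry)
```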
@@ -625,7 +626,8 @@ config = SeedingConfig(
 )

 # Returns a dictionary: {domain: [urls]}
-results = await seeder.many_urls(domains, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(domains, config)

 # Process results
 for domain, urls in results.items():
@@ -654,8 +656,8 @@ config = SeedingConfig(
     pattern="*/blog/*",
     max_urls=100
 )
-
-results = await seeder.many_urls(competitors, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(competitors, config)

 # Analyze content types
 for domain, urls in results.items():
@@ -690,8 +692,8 @@ config = SeedingConfig(
     score_threshold=0.3,
     max_urls=20    # Per site
 )
-
-results = await seeder.many_urls(educational_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(educational_sites, config)

 # Find the best beginner tutorials
 all_tutorials = []
@@ -731,8 +733,8 @@ config = SeedingConfig(
     score_threshold=0.5,    # High threshold for relevance
     max_urls=10
 )
-
-results = await seeder.many_urls(news_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(news_sites, config)

 # Collect all mentions
 mentions = []