Commit 69961cf

Merge branch 'develop' of https://github.com/unclecode/crawl4ai into develop

2 parents 9447054 + ef174a4

File tree

3 files changed: +37 -35 lines

  README.md
  deploy/docker/api.py
  docs/md_v2/core/url-seeding.md

README.md

Lines changed: 2 additions & 2 deletions
@@ -373,7 +373,7 @@ async def main():
     async with AsyncWebCrawler(config=browser_config) as crawler:
         result = await crawler.arun(
-            url="https://docs.micronaut.io/4.7.6/guide/",
+            url="https://docs.micronaut.io/4.9.9/guide/",
             config=run_config
         )
         print(len(result.markdown.raw_markdown))

@@ -425,7 +425,7 @@ async def main():
                 "type": "attribute",
                 "attribute": "src"
             }
-        }
+        ]
     }

     extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
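For context on the second hunk: in the README's extraction schema, "fields" holds a JSON list of field dicts, so it must close with "]" before the outer schema dict closes with "}". A minimal sketch of a schema in that shape (the name and selectors are illustrative placeholders, not taken from the README; assumes JsonCssExtractionStrategy is importable from the top-level crawl4ai package):

from crawl4ai import JsonCssExtractionStrategy  # assumed top-level import

# Illustrative schema: "fields" is a list of field dicts, so the list
# closes with "]" before the outer schema dict closes with "}".
schema = {
    "name": "Images",               # hypothetical schema name
    "baseSelector": "div.gallery",  # hypothetical CSS selector
    "fields": [
        {
            "name": "image_src",
            "selector": "img",
            "type": "attribute",
            "attribute": "src"
        }
    ]   # <-- the fix: the list closes here; the old README closed it with "}"
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)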

deploy/docker/api.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
-from datetime import datetime
+from datetime import datetime, timezone
 from base64 import b64encode

 import logging

@@ -576,7 +576,7 @@ async def handle_crawl_job(
     task_id = f"crawl_{uuid4().hex[:8]}"
     await redis.hset(f"task:{task_id}", mapping={
         "status": TaskStatus.PROCESSING,  # <-- keep enum values consistent
-        "created_at": datetime.utcnow().isoformat(),
+        "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
         "url": json.dumps(urls),  # store list as JSON string
         "result": "",
         "error": "",

docs/md_v2/core/url-seeding.md

Lines changed: 33 additions & 31 deletions
@@ -102,16 +102,16 @@ async def smart_blog_crawler():

     # Step 2: Configure discovery - let's find all blog posts
     config = SeedingConfig(
-        source="sitemap",            # Use the website's sitemap
-        pattern="*/blog/*.html",     # Only blog posts
+        source="sitemap+cc",         # Use the website's sitemap+cc
+        pattern="*/courses/*",       # Only courses related posts
         extract_head=True,           # Get page metadata
         max_urls=100                 # Limit for this example
     )

     # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
     urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")

     # Step 4: Filter for Python tutorials (using metadata!)
     tutorials = [

@@ -134,7 +134,8 @@ async def smart_blog_crawler():
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
             only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True
         )

         # Extract URLs and crawl them

@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())

 **What just happened?**

-1. We discovered all blog URLs from the sitemap
+1. We discovered all blog URLs from the sitemap+cc
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth

@@ -282,8 +283,8 @@ config = SeedingConfig(
     live_check=True,     # Verify each URL is accessible
     concurrency=20       # Check 20 URLs in parallel
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now you can filter by status
 live_urls = [u for u in urls if u["status"] == "valid"]
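For context, these docs hunks wrap URL discovery in an async context manager so the seeder's resources are released automatically. A minimal, self-contained sketch of that pattern (the domain is illustrative; assumes AsyncUrlSeeder and SeedingConfig can be imported from the top-level crawl4ai package):

import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig  # assumed top-level imports

async def main():
    config = SeedingConfig(
        source="sitemap+cc",   # sitemap plus Common Crawl, as in the updated docs
        live_check=True,       # verify each URL is reachable
        concurrency=20,
    )
    # The context manager closes the seeder's connections on exit
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("example.com", config)
    live_urls = [u for u in urls if u["status"] == "valid"]
    print(f"{len(live_urls)} live URLs out of {len(urls)} discovered")

asyncio.run(main())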
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
 config = SeedingConfig(
     extract_head=True    # Extract metadata from <head> section
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now each URL has rich metadata
 for url in urls[:3]:

@@ -387,8 +388,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.3
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # URLs are scored based on:
 # 1. Domain parts matching (e.g., 'python' in python.example.com)

@@ -429,8 +430,8 @@ config = SeedingConfig(
     extract_head=True,
     live_check=True
 )
-
-urls = await seeder.urls("blog.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("blog.example.com", config)

 # Analyze the results
 for url in urls[:5]:

@@ -488,8 +489,8 @@ config = SeedingConfig(
     scoring_method="bm25",   # Use BM25 algorithm
     score_threshold=0.3      # Minimum relevance score
 )
-
-urls = await seeder.urls("realpython.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("realpython.com", config)

 # Results are automatically sorted by relevance!
 for url in urls[:5]:

@@ -511,8 +512,8 @@ config = SeedingConfig(
     score_threshold=0.5,
     max_urls=20
 )
-
-urls = await seeder.urls("docs.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("docs.example.com", config)

 # The highest scoring URLs will be API docs!
 ```

@@ -529,8 +530,8 @@ config = SeedingConfig(
     score_threshold=0.4,
     pattern="*/product/*"   # Combine with pattern matching
 )
-
-urls = await seeder.urls("shop.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("shop.example.com", config)

 # Filter further by price (from metadata)
 affordable = [

@@ -550,8 +551,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.35
 )
-
-urls = await seeder.urls("technews.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("technews.com", config)

 # Filter by date
 from datetime import datetime, timedelta

@@ -591,8 +592,8 @@ for query in queries:
         score_threshold=0.4,
         max_urls=10   # Top 10 per topic
     )
-
-    urls = await seeder.urls("learning-platform.com", config)
+    async with AsyncUrlSeeder() as seeder:
+        urls = await seeder.urls("learning-platform.com", config)
     all_tutorials.extend(urls)

 # Remove duplicates while preserving order

@@ -625,7 +626,8 @@ config = SeedingConfig(
 )

 # Returns a dictionary: {domain: [urls]}
-results = await seeder.many_urls(domains, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(domains, config)

 # Process results
 for domain, urls in results.items():

@@ -654,8 +656,8 @@ config = SeedingConfig(
     pattern="*/blog/*",
     max_urls=100
 )
-
-results = await seeder.many_urls(competitors, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(competitors, config)

 # Analyze content types
 for domain, urls in results.items():

@@ -690,8 +692,8 @@ config = SeedingConfig(
     score_threshold=0.3,
     max_urls=20   # Per site
 )
-
-results = await seeder.many_urls(educational_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(educational_sites, config)

 # Find the best beginner tutorials
 all_tutorials = []

@@ -731,8 +733,8 @@ config = SeedingConfig(
     score_threshold=0.5,   # High threshold for relevance
     max_urls=10
 )
-
-results = await seeder.many_urls(news_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(news_sites, config)

 # Collect all mentions
 mentions = []
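The multi-domain examples get the same treatment around many_urls. A minimal sketch under the same assumptions (the domain list and pattern are illustrative):

import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig  # assumed top-level imports

async def main():
    domains = ["example.com", "example.org"]   # illustrative domains
    config = SeedingConfig(source="sitemap", pattern="*/blog/*", max_urls=100)

    # many_urls returns a dict mapping each domain to its discovered URLs
    async with AsyncUrlSeeder() as seeder:
        results = await seeder.many_urls(domains, config)

    for domain, urls in results.items():
        print(f"{domain}: {len(urls)} URLs")

asyncio.run(main())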
