Commit 1bd3de6

unclecode#1510: Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking
1 parent: d56b0eb · commit: 1bd3de6

File tree

2 files changed: +120 −0 lines changed


crawl4ai/deep_crawling/dfs_strategy.py

Lines changed: 81 additions & 0 deletions
@@ -4,6 +4,7 @@
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl

 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
     """
@@ -12,6 +13,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
     Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
     Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
     """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dfs_seen: Set[str] = set()
+
+    def _reset_seen(self, start_url: str) -> None:
+        self._dfs_seen = {start_url}
+
     async def _arun_batch(
         self,
         start_url: str,
@@ -27,6 +36,7 @@ async def _arun_batch(
         stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
         depths: Dict[str, int] = {start_url: 0}
         results: List[CrawlResult] = []
+        self._reset_seen(start_url)

         while stack and not self._cancel_event.is_set():
             url, parent, depth = stack.pop()
@@ -77,6 +87,7 @@ async def _arun_stream(
         visited: Set[str] = set()
         stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
         depths: Dict[str, int] = {start_url: 0}
+        self._reset_seen(start_url)

         while stack and not self._cancel_event.is_set():
             url, parent, depth = stack.pop()
@@ -108,3 +119,73 @@ async def _arun_stream(
                 for new_url, new_parent in reversed(new_links):
                     new_depth = depths.get(new_url, depth + 1)
                     stack.append((new_url, new_parent, new_depth))
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        _visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        DFS-specific link discovery that avoids mutating the traversal
+        'visited' set, preventing premature skips.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(
+                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+            )
+            return
+
+        links = result.links.get("internal", [])
+        if self.include_external:
+            links += result.links.get("external", [])
+
+        seen = self._dfs_seen
+        valid_links: List[Tuple[str, float]] = []
+
+        for link in links:
+            raw_url = link.get("href")
+            if not raw_url:
+                continue
+
+            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+            if not normalized_url or normalized_url in seen:
+                continue
+
+            if not await self.can_process_url(raw_url, next_depth):
+                self.stats.urls_skipped += 1
+                continue
+
+            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+            if score < self.score_threshold:
+                self.logger.debug(
+                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+                )
+                self.stats.urls_skipped += 1
+                continue
+
+            seen.add(normalized_url)
+            valid_links.append((normalized_url, score))
+
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(
+                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+            )
+
+        for url, score in valid_links:
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
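
Note: the fix above hinges on keeping two separate sets. The traversal's 'visited' set is updated only when a URL is actually popped off the stack and crawled, while the new '_dfs_seen' set deduplicates URLs at discovery time so the same link is never pushed twice; the inherited BFS link_discovery mutated 'visited' at discovery time, which could make the pop-time check skip pages prematurely. The sketch below shows that same pattern over a toy in-memory link graph; the GRAPH dict and example.com URLs are illustrative only and not part of the commit.

# Toy sketch of the seen/visited split used by the DFS strategy above.
GRAPH = {
    "https://example.com/": ["https://example.com/a", "https://example.com/b"],
    "https://example.com/a": ["https://example.com/b", "https://example.com/"],
    "https://example.com/b": ["https://example.com/a"],
}


def dfs(start_url: str) -> list[str]:
    visited: set[str] = set()     # updated only when a page is processed
    seen: set[str] = {start_url}  # dedup at discovery time, like _dfs_seen
    stack: list[str] = [start_url]
    order: list[str] = []

    while stack:
        url = stack.pop()
        if url in visited:        # same pop-time check as _arun_batch/_arun_stream
            continue
        visited.add(url)
        order.append(url)

        # Discovery: record links in 'seen' so duplicates are not re-queued,
        # but leave 'visited' alone so the page still gets processed later.
        for link in reversed(GRAPH.get(url, [])):
            if link not in seen:
                seen.add(link)
                stack.append(link)
    return order


print(dfs("https://example.com/"))
# -> ['https://example.com/', 'https://example.com/a', 'https://example.com/b']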

docs/examples/dfs_crawl_demo.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with: python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+    dfs_strategy = DFSDeepCrawlStrategy(
+        max_depth=3,
+        max_pages=50,
+        include_external=False,
+    )
+
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=dfs_strategy,
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+        stream=True,
+    )
+
+    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        async for result in await crawler.arun(url=seed_url, config=config):
+            depth = result.metadata.get("depth")
+            status = "SUCCESS" if result.success else "FAILED"
+            print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
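
For comparison, a non-streaming variant of the same demo is sketched below. It assumes that with stream=False the deep crawl takes the _arun_batch path shown in the diff above and that crawler.arun(...) then returns the collected CrawlResult objects as an iterable; that return shape is an assumption for illustration, not something this commit establishes.

# Batch (non-streaming) sketch; see the assumption noted above.
import asyncio

from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.cache_context import CacheMode
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy


async def main() -> None:
    config = CrawlerRunConfig(
        deep_crawl_strategy=DFSDeepCrawlStrategy(
            max_depth=2,        # shallower than the streaming demo
            max_pages=10,
            include_external=False,
        ),
        cache_mode=CacheMode.BYPASS,
        stream=False,           # collect results, then iterate afterwards
    )

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        results = await crawler.arun(url="https://docs.python.org/3/", config=config)
        for result in results:  # assumed: iterable of CrawlResult
            print(f"depth={result.metadata.get('depth')} url={result.url}")


if __name__ == "__main__":
    asyncio.run(main())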
