88
99class DFSDeepCrawlStrategy (BFSDeepCrawlStrategy ):
1010 """
11- Depth-First Search (DFS) deep crawling strategy .
11+ Depth-first deep crawling with familiar BFS rules.
1212
13- Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
14- Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
13+ We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
14+ but walk the graph with a stack so we fully explore one branch before hopping to the
15+ next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
16+ discovery time without accidentally marking them as “already crawled”.
1517 """
1618
1719 def __init__ (self , * args , ** kwargs ):
1820 super ().__init__ (* args , ** kwargs )
1921 self ._dfs_seen : Set [str ] = set ()
2022
2123 def _reset_seen (self , start_url : str ) -> None :
24+ """Start each crawl with a clean dedupe set seeded with the root URL."""
2225 self ._dfs_seen = {start_url }
2326
2427 async def _arun_batch (
@@ -28,8 +31,12 @@ async def _arun_batch(
2831 config : CrawlerRunConfig ,
2932 ) -> List [CrawlResult ]:
3033 """
31- Batch (non-streaming) DFS mode.
32- Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
34+ Crawl depth-first, one branch at a time, but emit results only at the end.
35+
36+ We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
37+ hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
38+ in control of traversal. Every successful page bumps ``_pages_crawled`` and
39+ seeds new stack items discovered via :meth:`link_discovery`.
3340 """
3441 visited : Set [str ] = set ()
3542 # Stack items: (url, parent_url, depth)
@@ -81,8 +88,11 @@ async def _arun_stream(
8188 config : CrawlerRunConfig ,
8289 ) -> AsyncGenerator [CrawlResult , None ]:
8390 """
84- Streaming DFS mode.
85- Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
91+ Same traversal as :meth:`_arun_batch`, but yield pages immediately.
92+
93+ Each popped URL is crawled, its metadata annotated, then the result gets
94+ yielded before we even look at the next stack entry. Successful crawls
95+ still feed :meth:`link_discovery`, keeping DFS order intact.
8696 """
8797 visited : Set [str ] = set ()
8898 stack : List [Tuple [str , Optional [str ], int ]] = [(start_url , None , 0 )]
@@ -130,8 +140,27 @@ async def link_discovery(
130140 depths : Dict [str , int ],
131141 ) -> None :
132142 """
133- DFS-specific link discovery that avoids mutating the traversal
134- 'visited' set, preventing premature skips.
143+ Find the next URLs we should push onto the DFS stack.
144+
145+ Parameters
146+ ----------
147+ result : CrawlResult
148+ Output of the page we just crawled; its ``links`` block is our raw material.
149+ source_url : str
150+ URL of the parent page; stored so callers can track ancestry.
151+ current_depth : int
152+ Depth of the parent; children naturally sit at ``current_depth + 1``.
153+ _visited : Set[str]
154+ Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
155+ next_level : list of tuples
156+ The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
157+ depths : dict
158+ Shared depth map so future metadata tagging knows how deep each URL lives.
159+
160+ Notes
161+ -----
162+ - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
163+ - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
135164 """
136165 next_depth = current_depth + 1
137166 if next_depth > self .max_depth :
0 commit comments