 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl
 
 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
     """
@@ -12,6 +13,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
     Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
     Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
     """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dfs_seen: Set[str] = set()
+
+    def _reset_seen(self, start_url: str) -> None:
+        self._dfs_seen = {start_url}
+
     async def _arun_batch(
         self,
         start_url: str,
@@ -27,6 +36,7 @@ async def _arun_batch(
         stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
         depths: Dict[str, int] = {start_url: 0}
         results: List[CrawlResult] = []
+        self._reset_seen(start_url)
 
         while stack and not self._cancel_event.is_set():
             url, parent, depth = stack.pop()
@@ -77,6 +87,7 @@ async def _arun_stream(
         visited: Set[str] = set()
         stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
         depths: Dict[str, int] = {start_url: 0}
+        self._reset_seen(start_url)
 
         while stack and not self._cancel_event.is_set():
             url, parent, depth = stack.pop()
@@ -108,3 +119,73 @@ async def _arun_stream(
             for new_url, new_parent in reversed(new_links):
                 new_depth = depths.get(new_url, depth + 1)
                 stack.append((new_url, new_parent, new_depth))
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        _visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        DFS-specific link discovery that avoids mutating the traversal
+        'visited' set, preventing premature skips.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(
+                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+            )
+            return
+
+        links = result.links.get("internal", [])
+        if self.include_external:
+            links += result.links.get("external", [])
+
+        seen = self._dfs_seen
+        valid_links: List[Tuple[str, float]] = []
+
+        for link in links:
+            raw_url = link.get("href")
+            if not raw_url:
+                continue
+
+            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+            if not normalized_url or normalized_url in seen:
+                continue
+
+            if not await self.can_process_url(raw_url, next_depth):
+                self.stats.urls_skipped += 1
+                continue
+
+            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+            if score < self.score_threshold:
+                self.logger.debug(
+                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+                )
+                self.stats.urls_skipped += 1
+                continue
+
+            seen.add(normalized_url)
+            valid_links.append((normalized_url, score))
+
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(
+                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+            )
+
+        for url, score in valid_links:
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
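
For reviewers who want to exercise the change end to end, here is a minimal usage sketch. It is an assumption-laden example, not part of the patch: the import paths and the constructor parameters (`max_depth`, `include_external`, `max_pages`, inherited from `BFSDeepCrawlStrategy`) follow crawl4ai's documented deep-crawl API and may need adjusting to the actual package layout.

```python
import asyncio

# Assumed public entry points; adjust to the actual crawl4ai package layout.
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy


async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=DFSDeepCrawlStrategy(
            max_depth=2,            # follow each branch this deep before backtracking
            include_external=False,
            max_pages=25,           # link_discovery trims candidates to this budget
        ),
    )
    async with AsyncWebCrawler() as crawler:
        # Batch mode: _arun_batch drives the stack and returns all results at once.
        results = await crawler.arun("https://example.com", config=config)
        for r in results:
            print(r.url, r.metadata.get("depth"))


asyncio.run(main())
```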
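Why the separate `_dfs_seen` set: the `_arun_batch`/`_arun_stream` loops check `visited` when an entry is popped off the stack, so if link discovery also inserted URLs into `visited` at discovery time (as the inherited BFS implementation does), every URL pushed onto the stack would already look visited when popped and would be skipped. A standalone toy sketch of the resulting two-set pattern (illustration only, not crawl4ai code; the graph and names are made up):

```python
def dfs(graph: dict[str, list[str]], start: str) -> list[str]:
    seen = {start}    # discovery-time dedup: plays the role of _dfs_seen
    visited = set()   # pop-time bookkeeping: plays the role of 'visited'
    stack, order = [start], []
    while stack:
        node = stack.pop()
        if node in visited:   # safe: nothing was marked visited at discovery
            continue
        visited.add(node)
        order.append(node)
        # reversed() so children are popped in their original order (LIFO stack)
        for child in reversed(graph.get(node, [])):
            if child not in seen:
                seen.add(child)
                stack.append(child)
    return order


# "D" is linked from both "B" and "C" but is queued and crawled only once:
print(dfs({"A": ["B", "C"], "B": ["D"], "C": ["D"], "D": []}, "A"))
# -> ['A', 'B', 'D', 'C']
```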