@@ -4,29 +4,46 @@
4 | 4 | from ..models import CrawlResult |
5 | 5 | from .bfs_strategy import BFSDeepCrawlStrategy # noqa |
6 | 6 | from ..types import AsyncWebCrawler, CrawlerRunConfig |
| 7 | +from ..utils import normalize_url_for_deep_crawl |
7 | 8 |
8 | 9 | class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): |
9 | 10 | """ |
10 | | - Depth-First Search (DFS) deep crawling strategy. |
| 11 | + Depth-first deep crawling that reuses the BFS strategy's rules.
11 | 12 |
12 | | - Inherits URL validation and link discovery from BFSDeepCrawlStrategy. |
13 | | - Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal. |
| 13 | + We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`, |
| 14 | + but walk the graph with a stack so we fully explore one branch before hopping to the |
| 15 | + next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at |
| 16 | + discovery time without accidentally marking them as “already crawled”. |
14 | 17 | """ |
| 18 | + |
| 19 | + def __init__(self, *args, **kwargs): |
| 20 | + super().__init__(*args, **kwargs) |
| 21 | + self._dfs_seen: Set[str] = set() |
| 22 | + |
| 23 | + def _reset_seen(self, start_url: str) -> None: |
| 24 | + """Start each crawl with a clean dedupe set seeded with the root URL.""" |
| 25 | + self._dfs_seen = {start_url} |
| 26 | + |
15 | 27 | async def _arun_batch( |
16 | 28 | self, |
17 | 29 | start_url: str, |
18 | 30 | crawler: AsyncWebCrawler, |
19 | 31 | config: CrawlerRunConfig, |
20 | 32 | ) -> List[CrawlResult]: |
21 | 33 | """ |
22 | | - Batch (non-streaming) DFS mode. |
23 | | - Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list. |
| 34 | + Crawl depth-first, but aggregate results and return them all at the end.
| 35 | +
| 36 | + We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and |
| 37 | + hand it to ``crawler.arun_many`` with deep crawling disabled so we remain |
| 38 | + in control of traversal. Every successful page increments ``_pages_crawled`` and
| 39 | + pushes the new links found by :meth:`link_discovery` onto the stack.
24 | 40 | """ |
25 | 41 | visited: Set[str] = set() |
26 | 42 | # Stack items: (url, parent_url, depth) |
27 | 43 | stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] |
28 | 44 | depths: Dict[str, int] = {start_url: 0} |
29 | 45 | results: List[CrawlResult] = [] |
| 46 | + self._reset_seen(start_url) |
30 | 47 |
31 | 48 | while stack and not self._cancel_event.is_set(): |
32 | 49 | url, parent, depth = stack.pop() |
@@ -71,12 +88,16 @@ async def _arun_stream( |
71 | 88 | config: CrawlerRunConfig, |
72 | 89 | ) -> AsyncGenerator[CrawlResult, None]: |
73 | 90 | """ |
74 | | - Streaming DFS mode. |
75 | | - Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available. |
| 91 | + Same traversal as :meth:`_arun_batch`, but yield pages immediately. |
| 92 | +
| 93 | + Each popped URL is crawled, its metadata is annotated, and the result is
| 94 | + yielded before the next stack entry is popped. Successful crawls
| 95 | + still feed :meth:`link_discovery`, keeping DFS order intact. |
76 | 96 | """ |
77 | 97 | visited: Set[str] = set() |
78 | 98 | stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] |
79 | 99 | depths: Dict[str, int] = {start_url: 0} |
| 100 | + self._reset_seen(start_url) |
80 | 101 |
81 | 102 | while stack and not self._cancel_event.is_set(): |
82 | 103 | url, parent, depth = stack.pop() |
@@ -108,3 +129,92 @@ async def _arun_stream( |
108 | 129 | for new_url, new_parent in reversed(new_links): |
109 | 130 | new_depth = depths.get(new_url, depth + 1) |
110 | 131 | stack.append((new_url, new_parent, new_depth)) |
| 132 | + |
| 133 | + async def link_discovery( |
| 134 | + self, |
| 135 | + result: CrawlResult, |
| 136 | + source_url: str, |
| 137 | + current_depth: int, |
| 138 | + _visited: Set[str], |
| 139 | + next_level: List[Tuple[str, Optional[str]]], |
| 140 | + depths: Dict[str, int], |
| 141 | + ) -> None: |
| 142 | + """ |
| 143 | + Find the next URLs we should push onto the DFS stack. |
| 144 | +
| 145 | + Parameters |
| 146 | + ---------- |
| 147 | + result : CrawlResult |
| 148 | + Output of the page we just crawled; its ``links`` block is our raw material. |
| 149 | + source_url : str |
| 150 | + URL of the parent page; stored so callers can track ancestry. |
| 151 | + current_depth : int |
| 152 | + Depth of the parent; children naturally sit at ``current_depth + 1``. |
| 153 | + _visited : Set[str] |
| 154 | + Present to match the BFS signature, but we rely on ``_dfs_seen`` instead. |
| 155 | + next_level : list of tuples |
| 156 | + The stack buffer supplied by the caller; we append new ``(url, parent)`` items here. |
| 157 | + depths : dict |
| 158 | + Shared depth map so future metadata tagging knows how deep each URL lives. |
| 159 | +
| 160 | + Notes |
| 161 | + ----- |
| 162 | + - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard. |
| 163 | + - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent. |
| 164 | + """ |
| 165 | + next_depth = current_depth + 1 |
| 166 | + if next_depth > self.max_depth: |
| 167 | + return |
| 168 | + |
| 169 | + remaining_capacity = self.max_pages - self._pages_crawled |
| 170 | + if remaining_capacity <= 0: |
| 171 | + self.logger.info( |
| 172 | + f"Max pages limit ({self.max_pages}) reached, stopping link discovery" |
| 173 | + ) |
| 174 | + return |
| 175 | + |
| 176 | + links = result.links.get("internal", []) |
| 177 | + if self.include_external: |
| 178 | + links += result.links.get("external", []) |
| 179 | + |
| 180 | + seen = self._dfs_seen |
| 181 | + valid_links: List[Tuple[str, float]] = [] |
| 182 | + |
| 183 | + for link in links: |
| 184 | + raw_url = link.get("href") |
| 185 | + if not raw_url: |
| 186 | + continue |
| 187 | + |
| 188 | + normalized_url = normalize_url_for_deep_crawl(raw_url, source_url) |
| 189 | + if not normalized_url or normalized_url in seen: |
| 190 | + continue |
| 191 | + |
| 192 | + if not await self.can_process_url(raw_url, next_depth): |
| 193 | + self.stats.urls_skipped += 1 |
| 194 | + continue |
| 195 | + |
| 196 | + score = self.url_scorer.score(normalized_url) if self.url_scorer else 0 |
| 197 | + if score < self.score_threshold: |
| 198 | + self.logger.debug( |
| 199 | + f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}" |
| 200 | + ) |
| 201 | + self.stats.urls_skipped += 1 |
| 202 | + continue |
| 203 | + |
| 204 | + seen.add(normalized_url) |
| 205 | + valid_links.append((normalized_url, score)) |
| 206 | + |
| 207 | + if len(valid_links) > remaining_capacity: |
| 208 | + if self.url_scorer: |
| 209 | + valid_links.sort(key=lambda x: x[1], reverse=True) |
| 210 | + valid_links = valid_links[:remaining_capacity] |
| 211 | + self.logger.info( |
| 212 | + f"Limiting to {remaining_capacity} URLs due to max_pages limit" |
| 213 | + ) |
| 214 | + |
| 215 | + for url, score in valid_links: |
| 216 | + if score: |
| 217 | + result.metadata = result.metadata or {} |
| 218 | + result.metadata["score"] = score |
| 219 | + next_level.append((url, source_url)) |
| 220 | + depths[url] = next_depth |
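
To make the traversal contract easier to review, here is a minimal, self-contained sketch (plain Python, no crawl4ai imports) of the rules the new docstrings describe: a LIFO stack, a dedupe set filled at discovery time in the role of `_dfs_seen`, and reversed pushes so links are explored in on-page order. All names in the sketch are illustrative only, not part of the crawl4ai API.

```python
from typing import Dict, List, Tuple


def dfs_order(graph: Dict[str, List[str]], start: str, max_pages: int) -> List[str]:
    """Visit pages depth-first, deduping links at discovery time."""
    seen = {start}                        # discovery-time dedupe, like _dfs_seen
    stack: List[Tuple[str, int]] = [(start, 0)]
    order: List[str] = []
    while stack and len(order) < max_pages:
        url, depth = stack.pop()
        order.append(url)                 # stand-in for "crawl the page"
        new_links = [u for u in graph.get(url, []) if u not in seen]
        seen.update(new_links)
        # Push in reverse so the first link on the page is popped first,
        # mirroring the reversed(new_links) loop in the diff above.
        for link in reversed(new_links):
            stack.append((link, depth + 1))
    return order


site = {
    "/": ["/a", "/b"],
    "/a": ["/a/1", "/a/2"],
    "/b": ["/b/1"],
}
print(dfs_order(site, "/", max_pages=10))
# ['/', '/a', '/a/1', '/a/2', '/b', '/b/1'] -- one branch finishes before the next starts
```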
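A hedged end-to-end usage sketch follows, assuming the public wiring stays as it is today: `DFSDeepCrawlStrategy` importable from `crawl4ai.deep_crawling`, constructor keywords such as `max_depth` and `max_pages` inherited from `BFSDeepCrawlStrategy`, and `CrawlerRunConfig(deep_crawl_strategy=..., stream=True)` driving `_arun_stream`. Treat it as a sketch of intent, not a verified snippet from this branch.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy


async def main() -> None:
    # stream=True exercises _arun_stream; omit it (or set False) to hit _arun_batch.
    config = CrawlerRunConfig(
        deep_crawl_strategy=DFSDeepCrawlStrategy(max_depth=2, max_pages=25),
        stream=True,
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://example.com", config=config):
            # depth comes from the strategy's metadata annotation; a score appears
            # only when a url_scorer is configured.
            print(result.url, (result.metadata or {}).get("depth"))


asyncio.run(main())
```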