
Commit be00fc3

Merge pull request unclecode#1598 from unclecode/fix/sitemap_seeder

unclecode#1559: Add tests for sitemap parsing and URL normalization in AsyncUr…

2 parents 124ac58 + 80745bc, commit be00fc3
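Summary of the change: `_iter_sitemap` previously read `<loc>` text verbatim and matched elements only in the canonical sitemap namespace. The diff below normalizes every `<loc>` value against the sitemap's final response URL and makes the XPath lookups namespace-agnostic. A minimal standalone sketch of the normalization idea, using only the standard library (the helper mirrors the diff's nested `_normalize_loc`, but this version is illustrative, not the shipped code):

```python
from typing import Optional
from urllib.parse import urljoin


def normalize_loc(base_url: str, raw: Optional[str]) -> Optional[str]:
    """Resolve a sitemap <loc> value against the sitemap's own URL.

    Empty or missing values yield None; relative paths become absolute;
    already-absolute URLs pass through unchanged (urljoin semantics).
    """
    if not raw:
        return None
    normalized = urljoin(base_url, raw.strip())
    return normalized or None


# Relative locs resolve against the sitemap URL; absolute ones are untouched.
assert normalize_loc("https://example.com/sitemap.xml", "/relative-path") == "https://example.com/relative-path"
assert normalize_loc("https://example.com/sitemap.xml", "https://example.com/absolute") == "https://example.com/absolute"
assert normalize_loc("https://example.com/sitemap.xml", None) is None
```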

File tree: 2 files changed (+193, -13 lines)

2 files changed

+193
-13
lines changed

crawl4ai/async_url_seeder.py

Lines changed: 59 additions & 13 deletions
```diff
@@ -845,6 +845,15 @@ async def _iter_sitemap(self, url: str):
             return
 
         data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
+        base_url = str(r.url)
+
+        def _normalize_loc(raw: Optional[str]) -> Optional[str]:
+            if not raw:
+                return None
+            normalized = urljoin(base_url, raw.strip())
+            if not normalized:
+                return None
+            return normalized
 
         # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
         is_sitemap_index = False
@@ -857,25 +866,42 @@ async def _iter_sitemap(self, url: str):
             # Use XML parser for sitemaps, not HTML parser
             parser = etree.XMLParser(recover=True)
             root = etree.fromstring(data, parser=parser)
+            # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
+            sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
+            url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
 
-            # Define namespace for sitemap
-            ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
+            self._log(
+                "debug",
+                "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+                params={
+                    "url": url,
+                    "sitemap_count": len(sitemap_loc_nodes),
+                    "url_count": len(url_loc_nodes),
+                },
+                tag="URL_SEED",
+            )
 
             # Check for sitemap index entries
-            sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
-            if sitemap_locs:
+            if sitemap_loc_nodes:
                 is_sitemap_index = True
-                for sitemap_elem in sitemap_locs:
-                    loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
+                for sitemap_elem in sitemap_loc_nodes:
+                    loc = _normalize_loc(sitemap_elem.text)
                     if loc:
                         sub_sitemaps.append(loc)
 
             # If not a sitemap index, get regular URLs
             if not is_sitemap_index:
-                for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
-                    loc = loc_elem.text.strip() if loc_elem.text else ""
+                for loc_elem in url_loc_nodes:
+                    loc = _normalize_loc(loc_elem.text)
                     if loc:
                         regular_urls.append(loc)
+                if not regular_urls:
+                    self._log(
+                        "warning",
+                        "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+                        params={"url": url},
+                        tag="URL_SEED",
+                    )
         except Exception as e:
             self._log("error", "LXML parsing error for sitemap {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -892,19 +918,39 @@ async def _iter_sitemap(self, url: str):
 
             # Check for sitemap index entries
             sitemaps = root.findall('.//sitemap')
+            url_entries = root.findall('.//url')
+            self._log(
+                "debug",
+                "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+                params={
+                    "url": url,
+                    "sitemap_count": len(sitemaps),
+                    "url_count": len(url_entries),
+                },
+                tag="URL_SEED",
+            )
             if sitemaps:
                 is_sitemap_index = True
                 for sitemap in sitemaps:
                     loc_elem = sitemap.find('loc')
-                    if loc_elem is not None and loc_elem.text:
-                        sub_sitemaps.append(loc_elem.text.strip())
+                    loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                    if loc:
+                        sub_sitemaps.append(loc)
 
             # If not a sitemap index, get regular URLs
             if not is_sitemap_index:
-                for url_elem in root.findall('.//url'):
+                for url_elem in url_entries:
                     loc_elem = url_elem.find('loc')
-                    if loc_elem is not None and loc_elem.text:
-                        regular_urls.append(loc_elem.text.strip())
+                    loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                    if loc:
+                        regular_urls.append(loc)
+                if not regular_urls:
+                    self._log(
+                        "warning",
+                        "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+                        params={"url": url},
+                        tag="URL_SEED",
+                    )
         except Exception as e:
             self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
```
New test file: 134 additions & 0 deletions
```diff
@@ -0,0 +1,134 @@
+import sys
+from types import SimpleNamespace
+
+import pytest
+
+# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
+# optional dependency issues (e.g., incompatible wheels in CI).
+class _FakeBM25:
+    def __init__(self, corpus):
+        self._scores = [1.0] * len(corpus)
+
+    def get_scores(self, tokens):
+        return self._scores
+
+
+sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
+
+from crawl4ai.async_url_seeder import AsyncUrlSeeder
+
+
+class DummyResponse:
+    def __init__(self, request_url: str, text: str):
+        self.status_code = 200
+        self._content = text.encode("utf-8")
+        self.url = request_url
+
+    def raise_for_status(self):
+        return None
+
+    @property
+    def content(self):
+        return self._content
+
+    @property
+    def text(self):
+        return self._content.decode("utf-8")
+
+
+class DummyAsyncClient:
+    def __init__(self, response_map):
+        self._responses = response_map
+
+    async def get(self, url, **kwargs):
+        payload = self._responses[url]
+        if callable(payload):
+            payload = payload()
+        return DummyResponse(url, payload)
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_namespace_less_sitemaps():
+    xml = """<?xml version="1.0"?>
+    <urlset>
+        <url><loc>https://example.com/a</loc></url>
+        <url><loc>https://example.com/b</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
+        urls.append(u)
+
+    assert urls == ["https://example.com/a", "https://example.com/b"]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_custom_namespace():
+    xml = """<?xml version="1.0"?>
+    <urlset xmlns="https://custom.namespace/schema">
+        <url><loc>https://example.com/ns</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
+        urls.append(u)
+
+    assert urls == ["https://example.com/ns"]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_namespace_index_and_children():
+    index_xml = """<?xml version="1.0"?>
+    <sitemapindex xmlns="http://another.example/ns">
+        <sitemap>
+            <loc>https://example.com/child-1.xml</loc>
+        </sitemap>
+        <sitemap>
+            <loc>https://example.com/child-2.xml</loc>
+        </sitemap>
+    </sitemapindex>
+    """
+    child_xml = """<?xml version="1.0"?>
+    <urlset xmlns="http://irrelevant">
+        <url><loc>https://example.com/page-{n}</loc></url>
+    </urlset>
+    """
+    responses = {
+        "https://example.com/index.xml": index_xml,
+        "https://example.com/child-1.xml": child_xml.format(n=1),
+        "https://example.com/child-2.xml": child_xml.format(n=2),
+    }
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/index.xml"):
+        urls.append(u)
+
+    assert sorted(urls) == [
+        "https://example.com/page-1",
+        "https://example.com/page-2",
+    ]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_normalizes_relative_locations():
+    xml = """<?xml version="1.0"?>
+    <urlset>
+        <url><loc>/relative-path</loc></url>
+        <url><loc>https://example.com/absolute</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
+        urls.append(u)
+
+    assert urls == [
+        "https://example.com/relative-path",
+        "https://example.com/absolute",
+    ]
```
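A note on the harness: these tests use `@pytest.mark.asyncio`, so the pytest-asyncio plugin must be installed, and they rely on `AsyncUrlSeeder` accepting an injected `client`. Under those same assumptions, the dummy pattern extends to further cases; for example, a hypothetical gzipped-sitemap test (not part of this commit) would need a bytes-capable response, since `DummyResponse` encodes str payloads:

```python
import gzip

# Hypothetical follow-up fixture: serve a gzipped sitemap so the
# `gzip.decompress(...) if url.endswith(".gz")` branch of _iter_sitemap
# is exercised. Raw bytes must survive the round trip unmodified.
class DummyGzipResponse:
    def __init__(self, request_url: str, xml_text: str):
        self.status_code = 200
        self.url = request_url
        self._content = gzip.compress(xml_text.encode("utf-8"))

    def raise_for_status(self):
        return None

    @property
    def content(self):
        return self._content
```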
