@@ -845,6 +845,15 @@ async def _iter_sitemap(self, url: str):
845845 return
846846
847847 data = gzip .decompress (r .content ) if url .endswith (".gz" ) else r .content
848+ base_url = str (r .url )
849+
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
    """Resolve the text of a sitemap ``<loc>`` element to an absolute URL.

    The value is stripped of surrounding whitespace and joined against
    ``base_url`` (taken from the enclosing scope), so relative locations
    are resolved relative to the sitemap that was fetched.

    Args:
        raw: Text content of a ``<loc>`` element, or ``None`` when the
            element is missing or has no text.

    Returns:
        The resolved URL, or ``None`` when ``raw`` is ``None``, empty, or
        contains only whitespace.
    """
    if not raw:
        return None
    cleaned = raw.strip()
    if not cleaned:
        # urljoin(base, "") returns base unchanged, so a blank <loc> would
        # otherwise be reported as the sitemap's own URL (and, for a sitemap
        # index, be queued as a sub-sitemap of itself).
        return None
    # Keep the "falsy result -> None" contract of the original helper.
    return urljoin(base_url, cleaned) or None
848857
849858 # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
850859 is_sitemap_index = False
@@ -857,25 +866,42 @@ async def _iter_sitemap(self, url: str):
857866 # Use XML parser for sitemaps, not HTML parser
858867 parser = etree .XMLParser (recover = True )
859868 root = etree .fromstring (data , parser = parser )
869+ # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
870+ sitemap_loc_nodes = root .xpath ("//*[local-name()='sitemap']/*[local-name()='loc']" )
871+ url_loc_nodes = root .xpath ("//*[local-name()='url']/*[local-name()='loc']" )
860872
861- # Define namespace for sitemap
862- ns = {'s' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
873+ self ._log (
874+ "debug" ,
875+ "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered" ,
876+ params = {
877+ "url" : url ,
878+ "sitemap_count" : len (sitemap_loc_nodes ),
879+ "url_count" : len (url_loc_nodes ),
880+ },
881+ tag = "URL_SEED" ,
882+ )
863883
864884 # Check for sitemap index entries
865- sitemap_locs = root .xpath ('//s:sitemap/s:loc' , namespaces = ns )
866- if sitemap_locs :
885+ if sitemap_loc_nodes :
867886 is_sitemap_index = True
868- for sitemap_elem in sitemap_locs :
869- loc = sitemap_elem . text . strip () if sitemap_elem .text else ""
887+ for sitemap_elem in sitemap_loc_nodes :
888+ loc = _normalize_loc ( sitemap_elem .text )
870889 if loc :
871890 sub_sitemaps .append (loc )
872891
873892 # If not a sitemap index, get regular URLs
874893 if not is_sitemap_index :
875- for loc_elem in root . xpath ( '//s:url/s:loc' , namespaces = ns ) :
876- loc = loc_elem . text . strip () if loc_elem .text else ""
894+ for loc_elem in url_loc_nodes :
895+ loc = _normalize_loc ( loc_elem .text )
877896 if loc :
878897 regular_urls .append (loc )
898+ if not regular_urls :
899+ self ._log (
900+ "warning" ,
901+ "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure." ,
902+ params = {"url" : url },
903+ tag = "URL_SEED" ,
904+ )
879905 except Exception as e :
880906 self ._log ("error" , "LXML parsing error for sitemap {url}: {error}" ,
881907 params = {"url" : url , "error" : str (e )}, tag = "URL_SEED" )
@@ -892,19 +918,39 @@ async def _iter_sitemap(self, url: str):
892918
893919 # Check for sitemap index entries
894920 sitemaps = root .findall ('.//sitemap' )
921+ url_entries = root .findall ('.//url' )
922+ self ._log (
923+ "debug" ,
924+ "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered" ,
925+ params = {
926+ "url" : url ,
927+ "sitemap_count" : len (sitemaps ),
928+ "url_count" : len (url_entries ),
929+ },
930+ tag = "URL_SEED" ,
931+ )
895932 if sitemaps :
896933 is_sitemap_index = True
897934 for sitemap in sitemaps :
898935 loc_elem = sitemap .find ('loc' )
899- if loc_elem is not None and loc_elem .text :
900- sub_sitemaps .append (loc_elem .text .strip ())
936+ loc = _normalize_loc (loc_elem .text if loc_elem is not None else None )
937+ if loc :
938+ sub_sitemaps .append (loc )
901939
902940 # If not a sitemap index, get regular URLs
903941 if not is_sitemap_index :
904- for url_elem in root . findall ( './/url' ) :
942+ for url_elem in url_entries :
905943 loc_elem = url_elem .find ('loc' )
906- if loc_elem is not None and loc_elem .text :
907- regular_urls .append (loc_elem .text .strip ())
944+ loc = _normalize_loc (loc_elem .text if loc_elem is not None else None )
945+ if loc :
946+ regular_urls .append (loc )
947+ if not regular_urls :
948+ self ._log (
949+ "warning" ,
950+ "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure." ,
951+ params = {"url" : url },
952+ tag = "URL_SEED" ,
953+ )
908954 except Exception as e :
909955 self ._log ("error" , "ElementTree parsing error for sitemap {url}: {error}" ,
910956 params = {"url" : url , "error" : str (e )}, tag = "URL_SEED" )
0 commit comments