22using System . Net . Http . Headers ;
33using System . Text ;
44using System . Xml ;
5+ using System . Xml . Linq ;
56using System . Xml . Serialization ;
67
78namespace RobotsSharpParser
@@ -50,7 +51,9 @@ public Robots(Uri websiteUri, string userAgent)
5051 _robotsUri = robots ;
5152 HttpClientHandler handler = new HttpClientHandler
5253 {
53- AutomaticDecompression = System . Net . DecompressionMethods . GZip | System . Net . DecompressionMethods . Deflate
54+ AutomaticDecompression = System . Net . DecompressionMethods . All ,
55+ AllowAutoRedirect = true ,
56+ MaxAutomaticRedirections = 5
5457 } ;
5558 _client = new HttpClient ( handler , true ) ;
5659 _client . DefaultRequestHeaders . TryAddWithoutValidation ( "User-Agent" , userAgent ) ;
@@ -205,13 +208,7 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
205208 if ( tSitemap is null )
206209 throw new ArgumentNullException ( nameof ( tSitemap ) , "sitemap requires a value" ) ;
207210
208- MemoryStream stream = new MemoryStream ( ) ;
209- Stream rawstream = await _client . GetStreamAsync ( tSitemap . loc ) ;
210- rawstream . CopyTo ( stream ) ;
211-
212- if ( ! TryDecompress ( stream , out byte [ ] bytes ) )
213- bytes = stream . ToArray ( ) ;
214-
211+ var bytes = await _client . GetByteArrayAsync ( tSitemap . loc ) ;
215212 if ( TryDeserializeXMLStream ( bytes , out urlset ? urlSet ) && urlSet ? . url is not null )
216213 return urlSet . url ;
217214 else
@@ -221,14 +218,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
221218 #endregion
222219
223220 private async Task < IReadOnlyList < tSitemap > > GetSitemapsInternal ( string sitemapUrl )
224- {
225- MemoryStream stream = new MemoryStream ( ) ;
226- Stream rawstream = await _client . GetStreamAsync ( sitemapUrl ) ;
227- rawstream . CopyTo ( stream ) ;
228-
229- if ( ! TryDecompress ( stream , out byte [ ] bytes ) )
230- bytes = stream . ToArray ( ) ;
231-
221+ {
222+ var bytes = await _client . GetByteArrayAsync ( sitemapUrl ) ;
232223 if ( TryDeserializeXMLStream ( bytes , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
233224 return sitemapIndex . sitemap ;
234225 else
@@ -238,13 +229,7 @@ private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUr
238229 private readonly List < tUrl > _sitemapLinks = new List < tUrl > ( 1000000 ) ;
239230 private async Task GetSitemapLinksInternal ( string siteIndex )
240231 {
241- MemoryStream stream = new MemoryStream ( ) ;
242- Stream rawstream = await _client . GetStreamAsync ( siteIndex ) ;
243- rawstream . CopyTo ( stream ) ;
244-
245- if ( ! TryDecompress ( stream , out byte [ ] bytes ) )
246- bytes = stream . ToArray ( ) ;
247-
232+ var bytes = await _client . GetByteArrayAsync ( siteIndex ) ;
248233 if ( TryDeserializeXMLStream ( bytes , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
249234 {
250235 foreach ( tSitemap sitemap in sitemapIndex . sitemap )
@@ -264,50 +249,38 @@ private async Task GetSitemapLinksInternal(string siteIndex)
264249
/// <summary>
/// Attempts to deserialize a raw XML payload into <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type the XML document is deserialized into.</typeparam>
/// <param name="bytes">Payload bytes; they are decoded as UTF-8 text.</param>
/// <param name="xmlValue">Receives the result produced by the <see cref="TextReader"/> overload.</param>
/// <returns>Whatever the <see cref="TextReader"/> overload returns for the decoded text.</returns>
private bool TryDeserializeXMLStream<T>(byte[] bytes, out T? xmlValue)
{
    // Decode first, then run the text through StripVersionFromString before parsing.
    string xmlText = StripVersionFromString(Encoding.UTF8.GetString(bytes));

    using (StringReader textReader = new StringReader(xmlText))
    {
        return TryDeserializeXMLStream(textReader, out xmlValue);
    }
}
272258
/// <summary>
/// Attempts to deserialize the XML document supplied by <paramref name="reader"/>
/// into <typeparamref name="T"/> using <see cref="XmlSerializer"/>.
/// </summary>
/// <typeparam name="T">Type the XML document is deserialized into.</typeparam>
/// <param name="reader">Text source containing the XML document.</param>
/// <param name="xmlValue">
/// The deserialized value on success; <c>default</c> when any exception is thrown.
/// </param>
/// <returns>
/// <c>true</c> when deserialization completed and produced a non-null value;
/// otherwise <c>false</c>.
/// </returns>
private bool TryDeserializeXMLStream<T>(TextReader reader, out T? xmlValue)
{
    try
    {
        XmlReaderSettings readerSettings = new XmlReaderSettings
        {
            ValidationType = ValidationType.None
        };

        using (XmlReader xml = XmlReader.Create(reader, readerSettings))
        {
            XmlSerializer deserializer = new XmlSerializer(typeof(T));
            xmlValue = (T?)deserializer.Deserialize(xml);
        }
    }
    catch
    {
        // Best effort by design: every failure is reported as "could not deserialize".
        xmlValue = default;
        return false;
    }

    return xmlValue is not null;
}
277+
/// <summary>
/// Removes a leading XML declaration (<c>&lt;?xml ... ?&gt;</c>) from <paramref name="val"/>.
/// </summary>
/// <remarks>
/// The declaration is only removed when it is the first thing in the text, optionally
/// preceded by a byte-order mark and/or whitespace (which are removed with it).
/// A <c>?&gt;</c> that appears later in the document, for example the end of a
/// processing instruction inside the root element, is left alone so the document is
/// never truncated.
/// </remarks>
/// <param name="val">XML text to inspect.</param>
/// <returns>
/// The text following the leading declaration, or <paramref name="val"/> unchanged when
/// it does not start with a complete declaration.
/// </returns>
private string StripVersionFromString(string val)
{
    const string DeclarationStart = "<?xml";
    const string DeclarationEnd = "?>";

    // Skip a byte-order mark and any whitespace that may precede the declaration.
    int start = 0;
    while (start < val.Length && (val[start] == '\uFEFF' || char.IsWhiteSpace(val[start])))
        start++;

    // Require whitespace after "<?xml" so targets such as "<?xml-stylesheet" do not match.
    int afterTarget = start + DeclarationStart.Length;
    bool startsWithDeclaration =
        afterTarget < val.Length
        && string.CompareOrdinal(val, start, DeclarationStart, 0, DeclarationStart.Length) == 0
        && char.IsWhiteSpace(val[afterTarget]);
    if (!startsWithDeclaration)
        return val;

    // Ordinal search: this is markup, not linguistic text.
    int end = val.IndexOf(DeclarationEnd, afterTarget, StringComparison.Ordinal);
    if (end == -1)
        return val;

    return val.Substring(end + DeclarationEnd.Length);
}
312285
313286 public void Dispose ( )
0 commit comments