Skip to content

Commit cf7767b

Browse files
author
Nicholas Bergsen
committed
Decompress and allow redirects when fetching robots. Improve parsing.
1 parent bedbe03 commit cf7767b

File tree

1 file changed

+28
-55
lines changed

1 file changed

+28
-55
lines changed

Robots/RobotsParser.cs

Lines changed: 28 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using System.Net.Http.Headers;
33
using System.Text;
44
using System.Xml;
5+
using System.Xml.Linq;
56
using System.Xml.Serialization;
67

78
namespace RobotsSharpParser
@@ -50,7 +51,9 @@ public Robots(Uri websiteUri, string userAgent)
5051
_robotsUri = robots;
5152
HttpClientHandler handler = new HttpClientHandler
5253
{
53-
AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate
54+
AutomaticDecompression = System.Net.DecompressionMethods.All,
55+
AllowAutoRedirect = true,
56+
MaxAutomaticRedirections = 5
5457
};
5558
_client = new HttpClient(handler, true);
5659
_client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", userAgent);
@@ -205,13 +208,7 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
205208
if (tSitemap is null)
206209
throw new ArgumentNullException(nameof(tSitemap), "sitemap requires a value");
207210

208-
MemoryStream stream = new MemoryStream();
209-
Stream rawstream = await _client.GetStreamAsync(tSitemap.loc);
210-
rawstream.CopyTo(stream);
211-
212-
if (!TryDecompress(stream, out byte[] bytes))
213-
bytes = stream.ToArray();
214-
211+
var bytes = await _client.GetByteArrayAsync(tSitemap.loc);
215212
if (TryDeserializeXMLStream(bytes, out urlset? urlSet) && urlSet?.url is not null)
216213
return urlSet.url;
217214
else
@@ -221,14 +218,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
221218
#endregion
222219

223220
private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUrl)
224-
{
225-
MemoryStream stream = new MemoryStream();
226-
Stream rawstream = await _client.GetStreamAsync(sitemapUrl);
227-
rawstream.CopyTo(stream);
228-
229-
if (!TryDecompress(stream, out byte[] bytes))
230-
bytes = stream.ToArray();
231-
221+
{
222+
var bytes = await _client.GetByteArrayAsync(sitemapUrl);
232223
if (TryDeserializeXMLStream(bytes, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
233224
return sitemapIndex.sitemap;
234225
else
@@ -238,13 +229,7 @@ private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUr
238229
private readonly List<tUrl> _sitemapLinks = new List<tUrl>(1000000);
239230
private async Task GetSitemapLinksInternal(string siteIndex)
240231
{
241-
MemoryStream stream = new MemoryStream();
242-
Stream rawstream = await _client.GetStreamAsync(siteIndex);
243-
rawstream.CopyTo(stream);
244-
245-
if (!TryDecompress(stream, out byte[] bytes))
246-
bytes = stream.ToArray();
247-
232+
var bytes = await _client.GetByteArrayAsync(siteIndex);
248233
if (TryDeserializeXMLStream(bytes, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
249234
{
250235
foreach (tSitemap sitemap in sitemapIndex.sitemap)
@@ -264,50 +249,38 @@ private async Task GetSitemapLinksInternal(string siteIndex)
264249

265250
private bool TryDeserializeXMLStream<T>(byte[] bytes, out T? xmlValue)
266251
{
267-
using (StringReader sr = new StringReader(Encoding.UTF8.GetString(bytes)))
268-
{
269-
return TryDeserializeXMLStream(sr, out xmlValue);
270-
}
252+
var stringVal = Encoding.UTF8.GetString(bytes);
253+
stringVal = StripVersionFromString(stringVal);
254+
255+
using StringReader sr = new StringReader(stringVal);
256+
return TryDeserializeXMLStream(sr, out xmlValue);
271257
}
272258

273259
private bool TryDeserializeXMLStream<T>(TextReader reader, out T? xmlValue)
274260
{
275261
try
276262
{
277-
using (XmlReader xmlReader = XmlReader.Create(reader))
278-
{
279-
XmlSerializer serializer = new XmlSerializer(typeof(T));
280-
xmlValue = (T?)serializer.Deserialize(xmlReader);
281-
return xmlValue is not null;
282-
}
263+
using XmlReader xmlReader = XmlReader.Create(reader, new XmlReaderSettings()
264+
{
265+
ValidationType = ValidationType.None
266+
});
267+
XmlSerializer serializer = new XmlSerializer(typeof(T));
268+
xmlValue = (T?)serializer.Deserialize(xmlReader);
269+
return xmlValue is not null;
283270
}
284271
catch
285272
{
286273
xmlValue = default;
287274
return false;
288275
}
289-
}
290-
291-
private bool TryDecompress(Stream stream, out byte[] bytes)
292-
{
293-
try
294-
{
295-
using (MemoryStream decompressedStream = new MemoryStream())
296-
{
297-
stream.Position = 0;
298-
using (GZipStream decompressionStream = new GZipStream(stream, CompressionMode.Decompress))
299-
{
300-
decompressionStream.CopyTo(decompressedStream);
301-
bytes = decompressedStream.ToArray();
302-
}
303-
}
304-
return true;
305-
}
306-
catch
307-
{
308-
bytes = new byte[0];
309-
return false;
310-
}
276+
}
277+
278+
private string StripVersionFromString(string val)
279+
{
280+
var endChar = val.IndexOf("?>");
281+
if(endChar != -1)
282+
return val.Remove(0, endChar + 2);
283+
return val;
311284
}
312285

313286
public void Dispose()

0 commit comments

Comments
 (0)