|
1 | | -using System; |
2 | | -using System.Collections.Generic; |
3 | | -using System.Linq; |
4 | | -using System.Net; |
| 1 | +using System.Net; |
5 | 2 | using System.ServiceModel.Syndication; |
6 | 3 | using System.Text.RegularExpressions; |
7 | 4 | using System.Xml.Linq; |
8 | 5 |
|
namespace SimpleFeedReader;

/// <summary>
/// The <see cref="DefaultFeedItemNormalizer"/> normalizes <see cref="FeedItem.Title"/>,
/// <see cref="FeedItem.Content"/> and <see cref="FeedItem.Summary"/> of <see cref="FeedItem"/>s to the point where
/// they no longer contain any HTML, redundant whitespace, un-normalized unicode chars and other control chars like
/// tabs, newlines or backspaces. The <see cref="FeedItem"/>'s <see cref="FeedItem.Date"/> property will contain
/// whichever date is latest; the <see cref="FeedItem.PublishDate"/> or <see cref="FeedItem.LastUpdatedDate"/>.
/// </summary>
/// <remarks>
/// You can implement a normalizer yourself by implementing the <see cref="IFeedItemNormalizer"/> interface.
/// </remarks>
public class DefaultFeedItemNormalizer : IFeedItemNormalizer
{
    // Anything that looks like a markup tag: '<', a run of non-'>' characters, then '>'.
    private static readonly Regex _htmlregex = new(@"<[^>]*>", RegexOptions.Compiled);

    // ASCII control characters (0x00-0x1F) plus DEL (0x7F).
    private static readonly Regex _controlcodesregex = new(@"[\x00-\x1F\x7f]", RegexOptions.Compiled);

    // Runs of two or more consecutive whitespace characters.
    private static readonly Regex _whitespaceregex = new(@"\s{2,}", RegexOptions.Compiled);

    /// <summary>
    /// Normalizes a SyndicationItem into a FeedItem.
    /// </summary>
    /// <param name="feed">The <see cref="SyndicationFeed"/> on which the item was retrieved.</param>
    /// <param name="item">A <see cref="SyndicationItem"/> to normalize into a <see cref="FeedItem"/>.</param>
    /// <returns>Returns a normalized <see cref="FeedItem"/>.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="item"/> is null.</exception>
    /// <remarks>
    /// <see cref="FeedItem.Content"/> is only populated for textual content
    /// (<see cref="TextSyndicationContent"/>); other content kinds result in a null value.
    /// <see cref="FeedItem.Images"/> and <see cref="FeedItem.Categories"/> are lazily evaluated sequences.
    /// </remarks>
    public virtual FeedItem Normalize(SyndicationFeed feed, SyndicationItem item)
    {
        // 'feed' is not read by this implementation; it is part of the IFeedItemNormalizer contract.
        if (item is null)
        {
            throw new ArgumentNullException(nameof(item));
        }

        return new FeedItem
        {
            Id = string.IsNullOrEmpty(item.Id) ? null : item.Id.Trim(),
            Title = Normalize(item.Title?.Text),
            // Only textual content carries a Text value; URL- or XML-based content cannot be cast to
            // TextSyndicationContent, so it is reported as "no content" instead of throwing.
            Content = item.Content is TextSyndicationContent textcontent ? Normalize(textcontent.Text) : null,
            Summary = Normalize(item.Summary?.Text),
            PublishDate = item.PublishDate,
            // A last-updated time of DateTimeOffset.MinValue is treated as "not set"; use the publish date instead.
            LastUpdatedDate = item.LastUpdatedTime == DateTimeOffset.MinValue ? item.PublishDate : item.LastUpdatedTime,
            Uri = GetFeedItemUri(item),
            Images = GetFeedItemImages(item),
            Categories = item.Categories.Select(c => c.Name)
        };
    }

    /// <summary>
    /// Determines the URI of an item: the absolute URI of the first link that has no relationship type or the
    /// "alternate" relationship type. When the item has no such link, the item's Id is used if (and only if) it
    /// is a valid absolute URI.
    /// </summary>
    /// <param name="item">The item to determine the URI for.</param>
    /// <returns>The item's URI, or null when none could be determined.</returns>
    private static Uri GetFeedItemUri(SyndicationItem item)
    {
        var alternatelink = item.Links.FirstOrDefault(l => l.RelationshipType == null || l.RelationshipType.Equals("alternate", StringComparison.OrdinalIgnoreCase));
        if (alternatelink is not null)
        {
            return alternatelink.GetAbsoluteUri();
        }

        // No usable link: fall back to the Id. TryCreate returns false (and yields null) for a null or
        // non-absolute Id, so no separate null check is required.
        return Uri.TryCreate(item.Id, UriKind.Absolute, out var parsed) ? parsed : null;
    }

    /// <summary>
    /// Returns a URI for each "image" element extension of the item.
    /// </summary>
    /// <param name="item">The item to retrieve the image URIs from.</param>
    /// <returns>
    /// A lazily evaluated sequence of image URIs; a <see cref="UriFormatException"/> is thrown during enumeration
    /// when an element's value is not a valid absolute URI.
    /// </returns>
    private static IEnumerable<Uri> GetFeedItemImages(SyndicationItem item) => item.ElementExtensions
        .Where(p => p.OuterName.Equals("image"))
        .Select(p => new Uri(p.GetObject<XElement>().Value));

    /// <summary>
    /// Normalizes a text value: HTML-decodes it, strips markup tags, replaces control characters and runs of
    /// whitespace by a single space, applies unicode normalization and trims the result.
    /// </summary>
    /// <param name="value">The value to normalize; may be null or empty.</param>
    /// <returns>
    /// The normalized value. Null or empty input is returned unchanged; null is returned when the value could
    /// not be fully HTML-decoded (see <see cref="HtmlDecode"/>).
    /// </returns>
    private static string Normalize(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return value;
        }

        // Decode before stripping so that encoded markup (e.g. "&lt;b&gt;") is removed as well.
        value = HtmlDecode(value);
        if (string.IsNullOrEmpty(value))
        {
            return value;
        }

        value = StripHTML(value);
        value = StripDoubleOrMoreWhiteSpace(RemoveControlChars(value));
        return value.Normalize().Trim();
    }

    /// <summary>Replaces each control character by a single space.</summary>
    private static string RemoveControlChars(string value) => _controlcodesregex.Replace(value, " ");

    /// <summary>Collapses each run of two or more whitespace characters into a single space.</summary>
    private static string StripDoubleOrMoreWhiteSpace(string value) => _whitespaceregex.Replace(value, " ");

    /// <summary>Replaces each markup tag by a single space.</summary>
    private static string StripHTML(string value) => _htmlregex.Replace(value, " ");

    /// <summary>
    /// Repeatedly HTML-decodes a value until decoding no longer changes it, so that double/triple/... encoded
    /// text is reduced to the original text.
    /// </summary>
    /// <param name="value">The (non-null) value to decode.</param>
    /// <param name="threshold">The maximum number of additional decoding passes after the first one.</param>
    /// <returns>
    /// The fully decoded value, or null when the value was still changing after the maximum number of passes.
    /// </returns>
    private static string HtmlDecode(string value, int threshold = 5)
    {
        var c = 0;
        var newvalue = WebUtility.HtmlDecode(value);
        while (!newvalue.Equals(value) && c < threshold) //Keep decoding (if a string is double/triple/... encoded; we want the original)
        {
            c++;
            value = newvalue;
            newvalue = WebUtility.HtmlDecode(value);
        }

        // Give up only when the text has not stabilised; a value that became stable on the very last
        // allowed pass is fully decoded and must not be discarded.
        return newvalue.Equals(value) ? newvalue : null;
    }
}
0 commit comments