From fe3b27dcefb5934cc3f564232028ec3647e4d703 Mon Sep 17 00:00:00 2001 From: ivarlovlie Date: Wed, 16 Nov 2022 15:56:19 +0700 Subject: feat: Add support for nrk.no/nyheter --- src/Services/GrabberService.cs | 95 ++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 46 deletions(-) (limited to 'src/Services') diff --git a/src/Services/GrabberService.cs b/src/Services/GrabberService.cs index 3974a9e..4886023 100644 --- a/src/Services/GrabberService.cs +++ b/src/Services/GrabberService.cs @@ -44,56 +44,59 @@ public class GrabberService using var md5 = MD5.Create(); var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url))); return await _memoryCache.GetOrCreateAsync(articleFilePrefix, async entry => { - entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); - var source = await GrabSourceAsync(url, articleFilePrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsArticle() { - CachedAt = source.CacheFileCreatedAt, - Href = url, - Title = doc.QuerySelector("h1.title")?.TextContent, - Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, - Authors = new List() - }; - - foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { - var author = new NewsArticle.Author() { - Name = authorNode.QuerySelector(".author__name")?.TextContent, - Title = authorNode.QuerySelector(".author__role")?.TextContent + entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); + var source = await GrabSourceAsync(url, articleFilePrefix); + var parser = new HtmlParser(); + var doc = await parser.ParseDocumentAsync(source.Content); + var result = new NewsArticle() { + CachedAt = source.CacheFileCreatedAt, + Href = url, + Title = doc.QuerySelector("h1.title")?.TextContent, + Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, + Authors = new List() }; - result.Authors.Add(author); - } - - DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); - DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); - result.UpdatedAt = modified; - result.PublishedAt = published; + foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { + var author = new NewsArticle.Author() { + Name = authorNode.QuerySelector(".author__name")?.TextContent, + Title = authorNode.QuerySelector(".author__role")?.TextContent + }; + result.Authors.Add(author); + } - if (doc.QuerySelector("kortstokk-app") != default) { - var excludes = new List() { - ".dhks-background", - ".dhks-actions", - ".dhks-credits", - ".dhks-sticky-reset", - ".dhks-byline" - }; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', excludes)); - } else { - var excludes = new List() { - ".compilation-reference", - ".section-reference", - ".widget", - ".image-reference", - ".video-reference", - ".article-body--updating", - ".reference" - }; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', excludes)); - } + DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); + DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); + + result.UpdatedAt = modified; + result.PublishedAt = published; + + if (doc.QuerySelector("kortstokk-app") != default) { + var excludes = new List() { + ".dhks-background", + ".dhks-actions", + ".dhks-credits", + ".dhks-sticky-reset", + ".dhks-byline" + }; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', excludes)); + } else if (url.Contains("nrk.no/nyheter")) { + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".bulletin-text").InnerHtml); + } else { + var excludes = new List() { + ".compilation-reference", + ".section-reference", + ".widget", + ".image-reference", + ".video-reference", + ".article-body--updating", + ".reference" + }; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', excludes)); + } - return result; - }); + return result; + }) + ; } public async Task GrabNrkAsync() { -- cgit v1.3