diff options
| author | ivarlovlie <git@ivarlovlie.no> | 2022-11-16 06:06:06 +0100 |
|---|---|---|
| committer | ivarlovlie <git@ivarlovlie.no> | 2022-11-16 06:06:06 +0100 |
| commit | a7e7d83057503262a9f2b56c4fb0c18e3f8ea15c (patch) | |
| tree | 7523f6ed741fc70882bce29718d626e506855484 /src | |
| parent | f5d60e1f072dc8971a546e0e4cd0b00c89dda44b (diff) | |
| download | lettnytt-a7e7d83057503262a9f2b56c4fb0c18e3f8ea15c.tar.xz lettnytt-a7e7d83057503262a9f2b56c4fb0c18e3f8ea15c.zip | |
feat: Add memory cache
Diffstat (limited to 'src')
| -rw-r--r-- | src/Pages/Read.cshtml | 45 | ||||
| -rw-r--r-- | src/Services/GrabberService.cs | 149 |
2 files changed, 114 insertions, 80 deletions
diff --git a/src/Pages/Read.cshtml b/src/Pages/Read.cshtml index cbbd69a..63543aa 100644 --- a/src/Pages/Read.cshtml +++ b/src/Pages/Read.cshtml @@ -9,26 +9,33 @@ <h1>@Model.Source.Title</h1> <p>@Model.Source.Subtitle</p> </div> - <div style="display: flex; flex-direction: column; flex-wrap: nowrap"> - <div style="flex-direction:column"> - @if (Model.Source.PublishedAt != default) { - <small style="white-space: nowrap">Publisert: @Model.Source.PublishedAt.ToString("dd-MM-yyyy hh:mm:ss")</small> - } - @if (Model.Source.UpdatedAt != default) { - <br/> - <small style="white-space: nowrap">Oppdatert: @Model.Source.UpdatedAt.ToString("dd-MM-yyyy hh:mm:ss")</small> - } - </div> - <div style="margin: 0 5px; border: 0.5px solid black"></div> - <div style="flex-direction:column"> - @foreach (var author in Model.Source.Authors) { - <small style="white-space: nowrap"><b>@author.Name</b>: @author.Title</small> - <br/> - } - </div> - </div> </div> <div id="art-body"> @Html.Raw(Model.Source.Content) -</div>
\ No newline at end of file +</div> +<footer> + <p> + <div style="display: flex; flex-direction: column; flex-wrap: nowrap;"> + <div style="flex-direction:column"> + @foreach (var author in Model.Source.Authors) { + <small style="white-space: nowrap"><b>@author.Name</b>: @author.Title</small> + <br/> + } + </div> + <div style="flex-direction: column"> + @if (Model.Source.PublishedAt != default) { + <small style="white-space: nowrap">Publisert: @Model.Source.PublishedAt.ToString("dd-MM-yyyy hh:mm:ss")</small> + } + @if (Model.Source.UpdatedAt != default) { + <br/> + <small style="white-space: nowrap">Oppdatert: @Model.Source.UpdatedAt.ToString("dd-MM-yyyy hh:mm:ss")</small> + } + <br/> + <small> + <a href="@Model.Source.Href">Les på nrk.no</a> + </small> + </div> + </div> + </p> +</footer>
\ No newline at end of file diff --git a/src/Services/GrabberService.cs b/src/Services/GrabberService.cs index f406d28..4a92517 100644 --- a/src/Services/GrabberService.cs +++ b/src/Services/GrabberService.cs @@ -2,12 +2,14 @@ using System.Security.Cryptography; using System.Text; using AngleSharp.Html.Parser; using I2R.LightNews.Utilities; +using Microsoft.Extensions.Caching.Memory; namespace I2R.LightNews.Services; public class GrabberService { private readonly ILogger<GrabberService> _logger; + private readonly MemoryCache _memoryCache; private readonly HttpClient _http; private const string NrkPrefix = "nrkno"; private const int StaleTime = 1800; @@ -16,88 +18,113 @@ public class GrabberService HostPath = "AppData/__sitecache" }; - public GrabberService(ILogger<GrabberService> logger, HttpClient http) { + public GrabberService(ILogger<GrabberService> logger, HttpClient http, MemoryCache memoryCache) { _logger = logger; _http = http; + _memoryCache = memoryCache; } - public async Task<NewsArticle> GrabNrkArticleAsync(string url) { + private bool IsSupportedNrkUrl(string url) { var strippedUrl = url.Replace("https://", "") .Replace("http://", "") .Replace("www.", ""); - if (!strippedUrl.StartsWith("nrk.no") - || strippedUrl.StartsWith("nrk.no/mat") - || strippedUrl.StartsWith("nrk.no/tv") - || strippedUrl.StartsWith("nrk.no/radio") - || strippedUrl.StartsWith("nrk.no/xl") - ) return default; + var ignored = new List<string>() { + "nrk.no/mat", + "nrk.no/radio", + "nrk.no/tv", + "nrk.no/xl" + }; + return strippedUrl.StartsWith("nrk.no") && ignored.All(c => !strippedUrl.Contains(c)); + } + + public async Task<NewsArticle> GrabNrkArticleAsync(string url) { + if (!IsSupportedNrkUrl(url)) return default; using var md5 = MD5.Create(); var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url))); - var source = await GrabSourceAsync(url, articleFilePrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsArticle() { - CachedAt = source.CacheFileCreatedAt, - Href = url, - Title = doc.QuerySelector("h1.title")?.TextContent, - Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, - Authors = new List<NewsArticle.Author>() - }; - - foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { - var author = new NewsArticle.Author() { - Name = authorNode.QuerySelector(".author__name")?.TextContent, - Title = authorNode.QuerySelector(".author__role")?.TextContent + return await _memoryCache.GetOrCreateAsync(articleFilePrefix, async entry => { + entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); + var source = await GrabSourceAsync(url, articleFilePrefix); + var parser = new HtmlParser(); + var doc = await parser.ParseDocumentAsync(source.Content); + var result = new NewsArticle() { + CachedAt = source.CacheFileCreatedAt, + Href = url, + Title = doc.QuerySelector("h1.title")?.TextContent, + Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, + Authors = new List<NewsArticle.Author>() }; - result.Authors.Add(author); - } - DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); - DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); - result.UpdatedAt = modified; - result.PublishedAt = published; - if (doc.QuerySelector("kortstokk-app") != default) { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, ".dhks-background,.dhks-actions,.dhks-credits,.dhks-sticky-reset,.dhks-byline"); - } else { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, "a,.section-reference,.widget,.article-body--updating,.video-reference,.image-reference,.reference"); - } + foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { + var author = new NewsArticle.Author() { + Name = authorNode.QuerySelector(".author__name")?.TextContent, + Title = authorNode.QuerySelector(".author__role")?.TextContent + }; + result.Authors.Add(author); + } - return result; + DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); + DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); + + result.UpdatedAt = modified; + result.PublishedAt = published; + + if (doc.QuerySelector("kortstokk-app") != default) { + var excludes = new List<string>() { + ".dhks-background", + ".dhks-actions", + ".dhks-credits", + ".dhks-sticky-reset", + ".dhks-byline" + }; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', excludes)); + } else { + var excludes = new List<string>() { + ".compilation-reference", + ".section-reference", + ".widget", + ".image-reference", + ".video-reference", + ".article-body--updating", + ".reference" + }; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', excludes)); + } + + return result; + }); } public async Task<NewsSource> GrabNrkAsync() { - var source = await GrabSourceAsync("https://nrk.no", NrkPrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsSource() { - Name = "nrk", - Attribution = "Fra https://nrk.no", - Created = source.CacheFileCreatedAt.DateTime, - CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst", - Articles = new List<NewsArticle>() - }; - - foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) { - var article = new NewsArticle { - Href = articleAnchorNode.Attributes["href"]?.Value.Trim(), - Title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim() + return await _memoryCache.GetOrCreateAsync(NrkPrefix, async entry => { + entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(31); + var source = await GrabSourceAsync("https://nrk.no", NrkPrefix); + var parser = new HtmlParser(); + var doc = await parser.ParseDocumentAsync(source.Content); + var result = new NewsSource() { + Name = "nrk", + Attribution = "Fra https://nrk.no", + Created = source.CacheFileCreatedAt.DateTime, + CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst", + Articles = new List<NewsArticle>() }; - if ( - article.Href.IsNullOrWhiteSpace() - || article.Title.IsNullOrWhiteSpace() - || (!article.Href?.StartsWith("https://www.nrk.no") ?? true) - || (article.Href?.StartsWith("https://www.nrk.no/mat") ?? false) - ) { - continue; - } + foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) { + var article = new NewsArticle { + Href = articleAnchorNode.Attributes["href"]?.Value.Trim(), + Title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim() + }; - result.Articles.Add(article); - } + if (article.Href.IsNullOrWhiteSpace() || article.Title.IsNullOrWhiteSpace() || !IsSupportedNrkUrl(article.Href)) { + continue; + } + + result.Articles.Add(article); + } - return result; + return result; + }); } private class SourceResult |
