diff options
| author | ivarlovlie <git@ivarlovlie.no> | 2022-12-02 04:04:42 +0100 |
|---|---|---|
| committer | ivarlovlie <git@ivarlovlie.no> | 2022-12-02 04:04:42 +0100 |
| commit | 623a45d1ec1f7e636defd139b35b615b1a64af91 (patch) | |
| tree | 0e5c2d5f1e96cd6f4adb305ed3f35dd02f2485ee /src/Services/GrabberService.cs | |
| parent | a453135b470565c56df2fd319dc927db67e299c6 (diff) | |
| download | lettnytt-623a45d1ec1f7e636defd139b35b615b1a64af91.tar.xz lettnytt-623a45d1ec1f7e636defd139b35b615b1a64af91.zip | |
feat: !WIP nrk radio
Diffstat (limited to 'src/Services/GrabberService.cs')
| -rw-r--r-- | src/Services/GrabberService.cs | 193 |
1 files changed, 0 insertions, 193 deletions
diff --git a/src/Services/GrabberService.cs b/src/Services/GrabberService.cs deleted file mode 100644 index d6650a2..0000000 --- a/src/Services/GrabberService.cs +++ /dev/null @@ -1,193 +0,0 @@ -using System.Security.Cryptography; -using System.Text; -using AngleSharp.Html.Parser; -using I2R.LightNews.Utilities; -using Microsoft.Extensions.Caching.Memory; - -namespace I2R.LightNews.Services; - -public class GrabberService -{ - private readonly ILogger<GrabberService> _logger; - private readonly IMemoryCache _memoryCache; - private readonly HttpClient _http; - private const string NrkPrefix = "nrkno"; - private const int StaleTime = 1800; - - private static AppPath _cachePath => new() { - HostPath = "AppData/__sitecache" - }; - - public GrabberService(ILogger<GrabberService> logger, HttpClient http, IMemoryCache memoryCache) { - _logger = logger; - _http = http; - _memoryCache = memoryCache; - } - - private bool IsSupportedNrkUrl(string url) { - var strippedUrl = url.Replace("https://", "") - .Replace("http://", "") - .Replace("www.", ""); - - var ignored = new List<string>() { - "nrk.no/mat", - "nrk.no/radio", - "nrk.no/tv", - "nrk.no/video", - "nrk.no/podkast" - }; - - return strippedUrl.StartsWith("nrk.no") && ignored.All(c => !strippedUrl.Contains(c)); - } - - public async Task<NewsArticle> GrabNrkArticleAsync(string url) { - if (!IsSupportedNrkUrl(url)) return default; - using var md5 = MD5.Create(); - var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url))); - return await _memoryCache.GetOrCreateAsync(articleFilePrefix, async entry => { - entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); - var source = await GrabSourceAsync(url, articleFilePrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsArticle() { - CachedAt = source.CacheFileCreatedAt, - Href = url, - Title = doc.QuerySelector("h1.title")?.TextContent, - Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, - Authors = new List<NewsArticle.Author>() - }; - - foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { - var author = new NewsArticle.Author() { - Name = authorNode.QuerySelector(".author__name")?.TextContent, - Title = authorNode.QuerySelector(".author__role")?.TextContent - }; - result.Authors.Add(author); - } - - DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); - DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); - - result.UpdatedAt = modified; - result.PublishedAt = published; - - var defaultExcludes = new List<string>() { - ".dhks-background", - ".dhks-actions", - ".dhks-credits", - ".dhks-sticky-reset", - ".dhks-byline", - ".compilation-reference", - ".section-reference", - ".image", - ".fact__expand", - ".image-reference", - ".video-reference", - ".article-body--updating", - ".external-reference", - ".reference", - ".atlas-reference", - ".remoterenderedcontent-reference", - "text:Følg utviklingen i NRKs Nyhetssenter", - "text:Bli med i debatten under" - }; - - if (doc.QuerySelector("kortstokk-app") != default) { - result.Title = doc.QuerySelector(".dhks-title span")?.TextContent; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', defaultExcludes)); - } else if (url.Contains("/xl/")) { - var subtitle = doc.QuerySelector(".article-feature__intro p").InnerHtml; - result.Title = doc.QuerySelector(".article-feature__intro h1").TextContent; - var contentHtml = doc.QuerySelector(".article-feature__body").InnerHtml; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(subtitle + contentHtml, string.Join(',', defaultExcludes)); - } else if (url.Contains("nrk.no/nyheter") || doc.QuerySelector(".bulletin-text") != default) { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".bulletin-text").InnerHtml); - } else { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', defaultExcludes)); - } - - return result; - }); - } - - public async Task<NewsSource> GrabNrkAsync() { - return await _memoryCache.GetOrCreateAsync(NrkPrefix, async entry => { - entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(31); - var source = await GrabSourceAsync("https://nrk.no", NrkPrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsSource() { - Name = "nrk", - Attribution = "Fra https://nrk.no", - Created = source.CacheFileCreatedAt.DateTime, - CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst", - Articles = new List<NewsArticle>() - }; - - foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) { - var article = new NewsArticle { - Href = articleAnchorNode.Attributes["href"]?.Value.Trim(), - Title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim() - }; - - if (article.Href.IsNullOrWhiteSpace() || article.Title.IsNullOrWhiteSpace() || !IsSupportedNrkUrl(article.Href)) { - continue; - } - - result.Articles.Add(article); - } - - return result; - }); - } - - private class SourceResult - { - public string CacheFileName { get; set; } - public string Content { get; set; } - public DateTimeOffset CacheFileCreatedAt { get; set; } - } - - private async Task<SourceResult> GrabSourceAsync(string url, string prefix, bool forceRefresh = false) { - var cacheFileName = forceRefresh ? default : GetLatestCacheFile(prefix); - if (cacheFileName != default) { - _logger.LogInformation("Returned cached {0} file, filename: {1}", url, cacheFileName.CacheFileName); - cacheFileName.Content = await File.ReadAllTextAsync(_cachePath.GetHostPathForFilename(cacheFileName.CacheFileName)); - return cacheFileName; - } - - var sourceResponse = await _http.GetAsync(url); - var sourceContent = await sourceResponse.Content.ReadAsStringAsync(); - var utcNow = DateTimeOffset.UtcNow; - var newCacheFileName = prefix + "-" + utcNow.ToUnixTimeSeconds() + ".html"; - await File.WriteAllTextAsync(_cachePath.GetHostPathForFilename(newCacheFileName), sourceContent); - _logger.LogInformation("Wrote new cache file for {0}, filename: {1}", url, newCacheFileName); - return new SourceResult() { - CacheFileName = newCacheFileName, - CacheFileCreatedAt = utcNow, - Content = sourceContent - }; - } - - private SourceResult GetLatestCacheFile(string prefix) { - var cacheDirectoryInfo = new DirectoryInfo(_cachePath.HostPath); - if (!cacheDirectoryInfo.Exists) { - cacheDirectoryInfo.Create(); - return default; - } - - var files = cacheDirectoryInfo.GetFiles(); - if (!files.Any()) return default; - var relevantFiles = files.Where(c => c.Name.StartsWith(prefix)).OrderBy(c => c.Name).ToList(); - if (!relevantFiles.Any()) return default; - var mostRecentFileName = relevantFiles.Last().Name; - var mostRecentEpochString = new string(mostRecentFileName.Skip(mostRecentFileName.LastIndexOf('-')).Where(Char.IsDigit).ToArray()); - long.TryParse(mostRecentEpochString, out var mostRecentEpochLong); - // more than 30 minutes since last grab - if (mostRecentEpochLong + StaleTime < DateTimeOffset.UtcNow.ToUnixTimeSeconds()) return default; - return new SourceResult { - CacheFileName = mostRecentFileName, - CacheFileCreatedAt = DateTimeOffset.FromUnixTimeSeconds(mostRecentEpochLong) - }; - } -}
\ No newline at end of file |
