From 623a45d1ec1f7e636defd139b35b615b1a64af91 Mon Sep 17 00:00:00 2001 From: ivarlovlie Date: Fri, 2 Dec 2022 12:04:42 +0900 Subject: feat: !WIP nrk radio --- src/Services/GrabberService.cs | 193 ---------------------------------------- src/Services/NrkNewsService.cs | 193 ++++++++++++++++++++++++++++++++++++++++ src/Services/NrkRadioService.cs | 31 +++++++ 3 files changed, 224 insertions(+), 193 deletions(-) delete mode 100644 src/Services/GrabberService.cs create mode 100644 src/Services/NrkNewsService.cs create mode 100644 src/Services/NrkRadioService.cs (limited to 'src/Services') diff --git a/src/Services/GrabberService.cs b/src/Services/GrabberService.cs deleted file mode 100644 index d6650a2..0000000 --- a/src/Services/GrabberService.cs +++ /dev/null @@ -1,193 +0,0 @@ -using System.Security.Cryptography; -using System.Text; -using AngleSharp.Html.Parser; -using I2R.LightNews.Utilities; -using Microsoft.Extensions.Caching.Memory; - -namespace I2R.LightNews.Services; - -public class GrabberService -{ - private readonly ILogger _logger; - private readonly IMemoryCache _memoryCache; - private readonly HttpClient _http; - private const string NrkPrefix = "nrkno"; - private const int StaleTime = 1800; - - private static AppPath _cachePath => new() { - HostPath = "AppData/__sitecache" - }; - - public GrabberService(ILogger logger, HttpClient http, IMemoryCache memoryCache) { - _logger = logger; - _http = http; - _memoryCache = memoryCache; - } - - private bool IsSupportedNrkUrl(string url) { - var strippedUrl = url.Replace("https://", "") - .Replace("http://", "") - .Replace("www.", ""); - - var ignored = new List() { - "nrk.no/mat", - "nrk.no/radio", - "nrk.no/tv", - "nrk.no/video", - "nrk.no/podkast" - }; - - return strippedUrl.StartsWith("nrk.no") && ignored.All(c => !strippedUrl.Contains(c)); - } - - public async Task GrabNrkArticleAsync(string url) { - if (!IsSupportedNrkUrl(url)) return default; - using var md5 = MD5.Create(); - var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url))); - return await _memoryCache.GetOrCreateAsync(articleFilePrefix, async entry => { - entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); - var source = await GrabSourceAsync(url, articleFilePrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsArticle() { - CachedAt = source.CacheFileCreatedAt, - Href = url, - Title = doc.QuerySelector("h1.title")?.TextContent, - Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, - Authors = new List() - }; - - foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { - var author = new NewsArticle.Author() { - Name = authorNode.QuerySelector(".author__name")?.TextContent, - Title = authorNode.QuerySelector(".author__role")?.TextContent - }; - result.Authors.Add(author); - } - - DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); - DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); - - result.UpdatedAt = modified; - result.PublishedAt = published; - - var defaultExcludes = new List() { - ".dhks-background", - ".dhks-actions", - ".dhks-credits", - ".dhks-sticky-reset", - ".dhks-byline", - ".compilation-reference", - ".section-reference", - ".image", - ".fact__expand", - ".image-reference", - ".video-reference", - ".article-body--updating", - ".external-reference", - ".reference", - ".atlas-reference", - ".remoterenderedcontent-reference", - "text:Følg utviklingen i NRKs Nyhetssenter", - "text:Bli med i debatten under" - }; - - if (doc.QuerySelector("kortstokk-app") != default) { - result.Title = doc.QuerySelector(".dhks-title span")?.TextContent; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', defaultExcludes)); - } else if (url.Contains("/xl/")) { - var subtitle = doc.QuerySelector(".article-feature__intro p").InnerHtml; - result.Title = doc.QuerySelector(".article-feature__intro h1").TextContent; - var contentHtml = doc.QuerySelector(".article-feature__body").InnerHtml; - result.Content = HtmlSanitiser.SanitizeHtmlFragment(subtitle + contentHtml, string.Join(',', defaultExcludes)); - } else if (url.Contains("nrk.no/nyheter") || doc.QuerySelector(".bulletin-text") != default) { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".bulletin-text").InnerHtml); - } else { - result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', defaultExcludes)); - } - - return result; - }); - } - - public async Task GrabNrkAsync() { - return await _memoryCache.GetOrCreateAsync(NrkPrefix, async entry => { - entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(31); - var source = await GrabSourceAsync("https://nrk.no", NrkPrefix); - var parser = new HtmlParser(); - var doc = await parser.ParseDocumentAsync(source.Content); - var result = new NewsSource() { - Name = "nrk", - Attribution = "Fra https://nrk.no", - Created = source.CacheFileCreatedAt.DateTime, - CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst", - Articles = new List() - }; - - foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) { - var article = new NewsArticle { - Href = articleAnchorNode.Attributes["href"]?.Value.Trim(), - Title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim() - }; - - if (article.Href.IsNullOrWhiteSpace() || article.Title.IsNullOrWhiteSpace() || !IsSupportedNrkUrl(article.Href)) { - continue; - } - - result.Articles.Add(article); - } - - return result; - }); - } - - private class SourceResult - { - public string CacheFileName { get; set; } - public string Content { get; set; } - public DateTimeOffset CacheFileCreatedAt { get; set; } - } - - private async Task GrabSourceAsync(string url, string prefix, bool forceRefresh = false) { - var cacheFileName = forceRefresh ? default : GetLatestCacheFile(prefix); - if (cacheFileName != default) { - _logger.LogInformation("Returned cached {0} file, filename: {1}", url, cacheFileName.CacheFileName); - cacheFileName.Content = await File.ReadAllTextAsync(_cachePath.GetHostPathForFilename(cacheFileName.CacheFileName)); - return cacheFileName; - } - - var sourceResponse = await _http.GetAsync(url); - var sourceContent = await sourceResponse.Content.ReadAsStringAsync(); - var utcNow = DateTimeOffset.UtcNow; - var newCacheFileName = prefix + "-" + utcNow.ToUnixTimeSeconds() + ".html"; - await File.WriteAllTextAsync(_cachePath.GetHostPathForFilename(newCacheFileName), sourceContent); - _logger.LogInformation("Wrote new cache file for {0}, filename: {1}", url, newCacheFileName); - return new SourceResult() { - CacheFileName = newCacheFileName, - CacheFileCreatedAt = utcNow, - Content = sourceContent - }; - } - - private SourceResult GetLatestCacheFile(string prefix) { - var cacheDirectoryInfo = new DirectoryInfo(_cachePath.HostPath); - if (!cacheDirectoryInfo.Exists) { - cacheDirectoryInfo.Create(); - return default; - } - - var files = cacheDirectoryInfo.GetFiles(); - if (!files.Any()) return default; - var relevantFiles = files.Where(c => c.Name.StartsWith(prefix)).OrderBy(c => c.Name).ToList(); - if (!relevantFiles.Any()) return default; - var mostRecentFileName = relevantFiles.Last().Name; - var mostRecentEpochString = new string(mostRecentFileName.Skip(mostRecentFileName.LastIndexOf('-')).Where(Char.IsDigit).ToArray()); - long.TryParse(mostRecentEpochString, out var mostRecentEpochLong); - // more than 30 minutes since last grab - if (mostRecentEpochLong + StaleTime < DateTimeOffset.UtcNow.ToUnixTimeSeconds()) return default; - return new SourceResult { - CacheFileName = mostRecentFileName, - CacheFileCreatedAt = DateTimeOffset.FromUnixTimeSeconds(mostRecentEpochLong) - }; - } -} \ No newline at end of file diff --git a/src/Services/NrkNewsService.cs b/src/Services/NrkNewsService.cs new file mode 100644 index 0000000..df9d64e --- /dev/null +++ b/src/Services/NrkNewsService.cs @@ -0,0 +1,193 @@ +using System.Security.Cryptography; +using System.Text; +using AngleSharp.Html.Parser; +using I2R.LightNews.Utilities; +using Microsoft.Extensions.Caching.Memory; + +namespace I2R.LightNews.Services; + +public class NrkNewsService +{ + private readonly ILogger _logger; + private readonly IMemoryCache _memoryCache; + private readonly HttpClient _http; + private const string NrkPrefix = "nrkno"; + private const int StaleTime = 1800; + + private static AppPath _cachePath => new() { + HostPath = "AppData/__sitecache" + }; + + public NrkNewsService(ILogger logger, HttpClient http, IMemoryCache memoryCache) { + _logger = logger; + _http = http; + _memoryCache = memoryCache; + } + + private bool IsSupportedNrkUrl(string url) { + var strippedUrl = url.Replace("https://", "") + .Replace("http://", "") + .Replace("www.", ""); + + var ignored = new List() { + "nrk.no/mat", + "nrk.no/radio", + "nrk.no/tv", + "nrk.no/video", + "nrk.no/podkast" + }; + + return strippedUrl.StartsWith("nrk.no") && ignored.All(c => !strippedUrl.Contains(c)); + } + + public async Task GrabNrkArticleAsync(string url) { + if (!IsSupportedNrkUrl(url)) return default; + using var md5 = MD5.Create(); + var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url))); + return await _memoryCache.GetOrCreateAsync(articleFilePrefix, async entry => { + entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(10); + var source = await GrabSourceAsync(url, articleFilePrefix); + var parser = new HtmlParser(); + var doc = await parser.ParseDocumentAsync(source.Content); + var result = new NewsArticle() { + CachedAt = source.CacheFileCreatedAt, + Href = url, + Title = doc.QuerySelector("h1.title")?.TextContent, + Subtitle = doc.QuerySelector(".article-lead p")?.TextContent, + Authors = new List() + }; + + foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) { + var author = new NewsArticle.Author() { + Name = authorNode.QuerySelector(".author__name")?.TextContent, + Title = authorNode.QuerySelector(".author__role")?.TextContent + }; + result.Authors.Add(author); + } + + DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published); + DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified); + + result.UpdatedAt = modified; + result.PublishedAt = published; + + var defaultExcludes = new List() { + ".dhks-background", + ".dhks-actions", + ".dhks-credits", + ".dhks-sticky-reset", + ".dhks-byline", + ".compilation-reference", + ".section-reference", + ".image", + ".fact__expand", + ".image-reference", + ".video-reference", + ".article-body--updating", + ".external-reference", + ".reference", + ".atlas-reference", + ".remoterenderedcontent-reference", + "text:Følg utviklingen i NRKs Nyhetssenter", + "text:Bli med i debatten under" + }; + + if (doc.QuerySelector("kortstokk-app") != default) { + result.Title = doc.QuerySelector(".dhks-title span")?.TextContent; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".dhks-cardSection").InnerHtml, string.Join(',', defaultExcludes)); + } else if (url.Contains("/xl/")) { + var subtitle = doc.QuerySelector(".article-feature__intro p").InnerHtml; + result.Title = doc.QuerySelector(".article-feature__intro h1").TextContent; + var contentHtml = doc.QuerySelector(".article-feature__body").InnerHtml; + result.Content = HtmlSanitiser.SanitizeHtmlFragment(subtitle + contentHtml, string.Join(',', defaultExcludes)); + } else if (url.Contains("nrk.no/nyheter") || (doc.QuerySelector(".bulletin-text") != default && doc.QuerySelector(".article-body") == defaultExcludes)) { + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".bulletin-text").InnerHtml); + } else { + result.Content = HtmlSanitiser.SanitizeHtmlFragment(doc.QuerySelector(".article-body").InnerHtml, string.Join(',', defaultExcludes)); + } + + return result; + }); + } + + public async Task GrabNrkAsync() { + return await _memoryCache.GetOrCreateAsync(NrkPrefix, async entry => { + entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(31); + var source = await GrabSourceAsync("https://nrk.no", NrkPrefix); + var parser = new HtmlParser(); + var doc = await parser.ParseDocumentAsync(source.Content); + var result = new NewsSource() { + Name = "nrk", + Attribution = "Fra https://nrk.no", + Created = source.CacheFileCreatedAt.DateTime, + CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst", + Articles = new List() + }; + + foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) { + var article = new NewsArticle { + Href = articleAnchorNode.Attributes["href"]?.Value.Trim(), + Title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim() + }; + + if (article.Href.IsNullOrWhiteSpace() || article.Title.IsNullOrWhiteSpace() || !IsSupportedNrkUrl(article.Href)) { + continue; + } + + result.Articles.Add(article); + } + + return result; + }); + } + + private class SourceResult + { + public string CacheFileName { get; set; } + public string Content { get; set; } + public DateTimeOffset CacheFileCreatedAt { get; set; } + } + + private async Task GrabSourceAsync(string url, string prefix, bool forceRefresh = false) { + var cacheFileName = forceRefresh ? default : GetLatestCacheFile(prefix); + if (cacheFileName != default) { + _logger.LogInformation("Returned cached {0} file, filename: {1}", url, cacheFileName.CacheFileName); + cacheFileName.Content = await File.ReadAllTextAsync(_cachePath.GetHostPathForFilename(cacheFileName.CacheFileName)); + return cacheFileName; + } + + var sourceResponse = await _http.GetAsync(url); + var sourceContent = await sourceResponse.Content.ReadAsStringAsync(); + var utcNow = DateTimeOffset.UtcNow; + var newCacheFileName = prefix + "-" + utcNow.ToUnixTimeSeconds() + ".html"; + await File.WriteAllTextAsync(_cachePath.GetHostPathForFilename(newCacheFileName), sourceContent); + _logger.LogInformation("Wrote new cache file for {0}, filename: {1}", url, newCacheFileName); + return new SourceResult() { + CacheFileName = newCacheFileName, + CacheFileCreatedAt = utcNow, + Content = sourceContent + }; + } + + private SourceResult GetLatestCacheFile(string prefix) { + var cacheDirectoryInfo = new DirectoryInfo(_cachePath.HostPath); + if (!cacheDirectoryInfo.Exists) { + cacheDirectoryInfo.Create(); + return default; + } + + var files = cacheDirectoryInfo.GetFiles(); + if (!files.Any()) return default; + var relevantFiles = files.Where(c => c.Name.StartsWith(prefix)).OrderBy(c => c.Name).ToList(); + if (!relevantFiles.Any()) return default; + var mostRecentFileName = relevantFiles.Last().Name; + var mostRecentEpochString = new string(mostRecentFileName.Skip(mostRecentFileName.LastIndexOf('-')).Where(Char.IsDigit).ToArray()); + long.TryParse(mostRecentEpochString, out var mostRecentEpochLong); + // more than 30 minutes since last grab + if (mostRecentEpochLong + StaleTime < DateTimeOffset.UtcNow.ToUnixTimeSeconds()) return default; + return new SourceResult { + CacheFileName = mostRecentFileName, + CacheFileCreatedAt = DateTimeOffset.FromUnixTimeSeconds(mostRecentEpochLong) + }; + } +} \ No newline at end of file diff --git a/src/Services/NrkRadioService.cs b/src/Services/NrkRadioService.cs new file mode 100644 index 0000000..a2889ce --- /dev/null +++ b/src/Services/NrkRadioService.cs @@ -0,0 +1,31 @@ +using Microsoft.Extensions.Caching.Memory; + +namespace I2R.LightNews.Services; + +public class NrkRadioService +{ + private readonly IMemoryCache _cache; + private readonly HttpClient _http; + private const string CATEGORY_SEARCH_CACHE_KEY = "category_search"; + + public NrkRadioService(IMemoryCache cache, HttpClient http) { + _cache = cache; + http.BaseAddress = new Uri("https://psapi.nrk.no"); + _http = http; + } + + public async Task GetEverythingAsync() { + var path = "/radio/search/categories/alt-innhold"; + var everything = new List(); + while (path.HasValue()) { + var response = await _http.GetFromJsonAsync(path); + + } + } + + public async Task SearchCategoriesAsync(string query, int take = 50, int skip = 50) { + return await _http.GetFromJsonAsync( + "/radio/search/categories/alt-innhold?q=" + query + "&take=" + take + "&skip=" + skip + ); + } +} \ No newline at end of file -- cgit v1.3