diff options
| author | ivarlovlie <git@ivarlovlie.no> | 2022-11-15 05:05:05 +0100 |
|---|---|---|
| committer | ivarlovlie <git@ivarlovlie.no> | 2022-11-15 05:05:05 +0100 |
| commit | add94527050dc311c4ad117e25dd5e4517b3b887 (patch) | |
| tree | 137306ea8c7c63b6287bd77c6deaf09e1319833e /src/Services/GrabberService.cs | |
| download | lettnytt-add94527050dc311c4ad117e25dd5e4517b3b887.tar.xz lettnytt-add94527050dc311c4ad117e25dd5e4517b3b887.zip | |
feat: Initial commit
Diffstat (limited to 'src/Services/GrabberService.cs')
| -rw-r--r-- | src/Services/GrabberService.cs | 147 |
1 files changed, 147 insertions, 0 deletions
// src/Services/GrabberService.cs
using System.Security.Cryptography;
using System.Text;
using AngleSharp.Html.Parser;
using I2R.LightNews.Utilities;

namespace I2R.LightNews.Services;

/// <summary>
/// Downloads pages from nrk.no, stores the raw HTML in an on-disk cache and
/// extracts front-page article links and individual articles from the markup.
/// </summary>
public class GrabberService
{
    private readonly ILogger<GrabberService> _logger;
    private readonly HttpClient _http;
    private const string NrkPrefix = "nrkno";

    // Maximum age of a cache file, in seconds, before it is considered stale.
    // 100800 s = 28 hours.
    // NOTE(review): the original inline comment claimed "30 minutes" (which would be 1800) — confirm the intended value.
    private const int StaleTime = 100800;

    // Path prefixes on nrk.no that are not treated as readable articles.
    private static readonly string[] ExcludedNrkPathPrefixes = { "/mat", "/tv", "/radio", "/xl" };

    private static AppPath _cachePath => new() {
        HostPath = "AppData/__sitecache"
    };

    public GrabberService(ILogger<GrabberService> logger, HttpClient http) {
        _logger = logger;
        _http = http;
    }

    /// <summary>
    /// Downloads a single nrk.no article and extracts title, subtitle, authors,
    /// timestamps and sanitised body HTML.
    /// </summary>
    /// <param name="url">Absolute http(s) URL of the article on nrk.no or www.nrk.no.</param>
    /// <returns>
    /// The parsed article, or <c>default</c> when <paramref name="url"/> is not an
    /// accepted nrk.no article URL. <c>Content</c> is left unset when the page has
    /// no <c>.article-body</c> element.
    /// </returns>
    /// <exception cref="HttpRequestException">The download failed or returned a non-success status code.</exception>
    public async Task<NewsArticle> GrabNrkArticleAsync(string url) {
        if (!IsSupportedNrkArticleUrl(url)) return default;

        // The MD5 digest is only used to derive a stable cache file name from the URL.
        using var md5 = MD5.Create();
        var articleFilePrefix = "art-" + NrkPrefix + "-" + Convert.ToHexString(md5.ComputeHash(Encoding.UTF8.GetBytes(url)));
        var source = await GrabSourceAsync(url, articleFilePrefix, true);
        var parser = new HtmlParser();
        var doc = await parser.ParseDocumentAsync(source.Content);
        var result = new NewsArticle() {
            CachedAt = source.CacheFileCreatedAt,
            Href = url,
            Title = doc.QuerySelector("h1.title")?.TextContent,
            Subtitle = doc.QuerySelector(".article-lead p")?.TextContent,
            Authors = new List<NewsArticle.Author>()
        };

        foreach (var authorNode in doc.QuerySelectorAll(".authors .author")) {
            var author = new NewsArticle.Author() {
                Name = authorNode.QuerySelector(".author__name")?.TextContent,
                Title = authorNode.QuerySelector(".author__role")?.TextContent
            };
            result.Authors.Add(author);
        }

        // A failed parse leaves the out value at default(DateTime); that is intentional.
        DateTime.TryParse(doc.QuerySelector("time.datePublished")?.Attributes["datetime"]?.Value, out var published);
        DateTime.TryParse(doc.QuerySelector("time.dateModified")?.Attributes["datetime"]?.Value, out var modified);
        result.UpdatedAt = modified;
        result.PublishedAt = published;

        // Not every page has an article body; guard instead of dereferencing null.
        var articleBody = doc.QuerySelector(".article-body");
        if (articleBody is not null) {
            result.Content = HtmlSanitiser.SanitizeHtmlFragment(articleBody.InnerHtml, "img,a,.video-reference,.image-reference,.reference");
        } else {
            _logger.LogWarning("No .article-body element found for {Url}", url);
        }

        return result;
    }

    /// <summary>
    /// Fetches the nrk.no front page (from cache when a fresh cache file exists)
    /// and collects the article links found in <c>main section a</c>.
    /// </summary>
    /// <returns>A news source holding the front-page article links and titles.</returns>
    /// <exception cref="HttpRequestException">The download failed or returned a non-success status code.</exception>
    public async Task<NewsSource> GrabNrkAsync() {
        var source = await GrabSourceAsync("https://nrk.no", NrkPrefix);
        var parser = new HtmlParser();
        var doc = await parser.ParseDocumentAsync(source.Content);
        var result = new NewsSource() {
            Name = "nrk",
            Attribution = "Fra https://nrk.no",
            Created = source.CacheFileCreatedAt.DateTime,
            CanonicalUrl = doc.QuerySelector("link[rel='canonical']")?.Attributes["href"]?.Value ?? "uvisst",
            Articles = new List<NewsArticle>()
        };

        foreach (var articleAnchorNode in doc.QuerySelectorAll("main section a")) {
            var href = articleAnchorNode.Attributes["href"]?.Value.Trim();
            var title = articleAnchorNode.QuerySelector(".kur-room__title span")?.TextContent.Trim();

            // Keep only titled links into www.nrk.no, skipping the /mat section.
            var isNrkLink = href?.StartsWith("https://www.nrk.no", StringComparison.Ordinal) ?? false;
            var isFoodLink = href?.StartsWith("https://www.nrk.no/mat", StringComparison.Ordinal) ?? false;
            if (href.IsNullOrWhiteSpace() || title.IsNullOrWhiteSpace() || !isNrkLink || isFoodLink) {
                continue;
            }

            result.Articles.Add(new NewsArticle {
                Href = href,
                Title = title
            });
        }

        return result;
    }

    /// <summary>
    /// Decides whether <paramref name="url"/> is an article URL this service will fetch:
    /// an absolute http(s) URL whose host is exactly nrk.no (optionally prefixed with
    /// "www.") and whose path does not start with one of the excluded section prefixes.
    /// </summary>
    private static bool IsSupportedNrkArticleUrl(string url) {
        if (string.IsNullOrWhiteSpace(url)) return false;
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) return false;
        if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps) return false;

        // Compare the parsed host exactly; a plain prefix test on the raw string
        // would also accept hosts such as "nrk.no.example.com".
        var host = uri.Host;
        if (host.StartsWith("www.", StringComparison.OrdinalIgnoreCase)) host = host.Substring(4);
        if (!host.Equals("nrk.no", StringComparison.OrdinalIgnoreCase)) return false;

        var path = uri.AbsolutePath;
        return !ExcludedNrkPathPrefixes.Any(excluded => path.StartsWith(excluded, StringComparison.Ordinal));
    }

    /// <summary>Raw page content together with the cache file it is stored in.</summary>
    private class SourceResult
    {
        public string CacheFileName { get; set; }
        public string Content { get; set; }
        public DateTimeOffset CacheFileCreatedAt { get; set; }
    }

    /// <summary>
    /// Returns the HTML for <paramref name="url"/>, served from the newest fresh cache
    /// file for <paramref name="prefix"/> unless <paramref name="forceRefresh"/> is set,
    /// otherwise downloaded and written to a new cache file.
    /// </summary>
    /// <param name="url">Address to download when no usable cache file exists.</param>
    /// <param name="prefix">Cache file name prefix identifying this source.</param>
    /// <param name="forceRefresh">When true the cache lookup is skipped and the page is always downloaded.</param>
    /// <exception cref="HttpRequestException">The download failed or returned a non-success status code.</exception>
    private async Task<SourceResult> GrabSourceAsync(string url, string prefix, bool forceRefresh = false) {
        var cached = forceRefresh ? null : GetLatestCacheFile(prefix);
        if (cached is not null) {
            _logger.LogInformation("Returned cached {0} file, filename: {1}", url, cached.CacheFileName);
            cached.Content = await File.ReadAllTextAsync(_cachePath.GetHostPathForFilename(cached.CacheFileName));
            return cached;
        }

        using var sourceResponse = await _http.GetAsync(url);
        // Never cache or parse an error page as if it were real content.
        sourceResponse.EnsureSuccessStatusCode();
        var sourceContent = await sourceResponse.Content.ReadAsStringAsync();
        var utcNow = DateTimeOffset.UtcNow;
        var newCacheFileName = prefix + "-" + utcNow.ToUnixTimeSeconds() + ".html";
        var newCacheFilePath = _cachePath.GetHostPathForFilename(newCacheFileName);

        // With forceRefresh the cache lookup (which creates the directory) is skipped,
        // so make sure the target directory exists before writing.
        var cacheDirectory = Path.GetDirectoryName(newCacheFilePath);
        if (!string.IsNullOrEmpty(cacheDirectory)) Directory.CreateDirectory(cacheDirectory);

        await File.WriteAllTextAsync(newCacheFilePath, sourceContent);
        _logger.LogInformation("Wrote new cache file for {0}, filename: {1}", url, newCacheFileName);
        return new SourceResult() {
            CacheFileName = newCacheFileName,
            CacheFileCreatedAt = utcNow,
            Content = sourceContent
        };
    }

    /// <summary>
    /// Finds the newest cache file whose name starts with <paramref name="prefix"/>.
    /// </summary>
    /// <returns>
    /// File name and creation time (taken from the unix timestamp in the file name) of
    /// the newest cache file, or <c>default</c> when the cache directory did not exist
    /// (it is created), no matching file exists, or the newest file is older than
    /// <see cref="StaleTime"/> seconds. <c>Content</c> is not populated here.
    /// </returns>
    private SourceResult GetLatestCacheFile(string prefix) {
        var cacheDirectoryInfo = new DirectoryInfo(_cachePath.HostPath);
        if (!cacheDirectoryInfo.Exists) {
            cacheDirectoryInfo.Create();
            return default;
        }

        // File names end in "-<unix seconds>.html", so ordinal name order is
        // chronological order for timestamps with the same number of digits.
        var mostRecentFileName = cacheDirectoryInfo.GetFiles()
            .Select(file => file.Name)
            .Where(name => name.StartsWith(prefix, StringComparison.Ordinal))
            .OrderBy(name => name, StringComparer.Ordinal)
            .LastOrDefault();
        if (mostRecentFileName is null) return default;

        // Take the digits after the last '-' as the unix timestamp of the file.
        var mostRecentEpochString = new string(mostRecentFileName.Skip(mostRecentFileName.LastIndexOf('-')).Where(char.IsDigit).ToArray());
        if (!long.TryParse(mostRecentEpochString, out var mostRecentEpochLong)) return default;

        // Older than StaleTime seconds (28 hours) since the last grab: treat as a cache miss.
        if (mostRecentEpochLong + StaleTime < DateTimeOffset.UtcNow.ToUnixTimeSeconds()) return default;
        return new SourceResult {
            CacheFileName = mostRecentFileName,
            CacheFileCreatedAt = DateTimeOffset.FromUnixTimeSeconds(mostRecentEpochLong)
        };
    }
}
\ No newline at end of file |
