| | | 1 | | using GistBackend.Exceptions; |
| | | 2 | | using GistBackend.Handlers.ChromaDbHandler; |
| | | 3 | | using GistBackend.Handlers.MariaDbHandler; |
| | | 4 | | using GistBackend.Handlers.RssFeedHandler; |
| | | 5 | | using GistBackend.Handlers.WebCrawlHandler; |
| | | 6 | | using GistBackend.Types; |
| | | 7 | | using GistBackend.Utils; |
| | | 8 | | using Microsoft.Extensions.Hosting; |
| | | 9 | | using Microsoft.Extensions.Logging; |
| | | 10 | | using Microsoft.Extensions.Options; |
| | | 11 | | using Prometheus; |
| | | 12 | | using static GistBackend.Utils.LogEvents; |
| | | 13 | | using Summary = Prometheus.Summary; |
| | | 14 | | |
| | | 15 | | namespace GistBackend.Services; |
| | | 16 | | |
/// <summary>
/// Background service that, in a loop, re-parses all configured RSS feeds and then re-checks the stored
/// gists, ensuring each gist's disabled state in MariaDB and its metadata in ChromaDB are correct.
/// </summary>
public class CleanupService(
    IRssFeedHandler rssFeedHandler,
    IGistDebouncer gistDebouncer,
    IMariaDbHandler mariaDbHandler,
    IChromaDbHandler chromaDbHandler,
    IWebCrawlHandler webCrawlHandler,
    IOptions<CleanupServiceOptions> options,
    ILogger<CleanupService>? logger)
    : BackgroundService
{
    private static readonly Gauge CleanupGistsGauge =
        Metrics.CreateGauge("cleanup_gists_seconds", "Time spent to cleanup gists");
    private static readonly Summary CheckGistSummary =
        Metrics.CreateSummary("check_gist_seconds", "Time spent to check a gist", "feed_title");
    private static readonly Gauge GistsCheckedGauge =
        Metrics.CreateGauge("gists_checked", "Number of gists checked in one run");

    // IDs of all feeds found in the database during the current run. A feed can be in this set without
    // being in _feedsByFeedId when a ParsingFeedException was thrown after the database lookup.
    private HashSet<int> _feedsInDb = [];

    // Feeds of the current run whose entries were parsed, keyed by their database feed ID.
    private Dictionary<int, RssFeed> _feedsByFeedId = new();

    /// <summary>
    /// Runs the cleanup loop until cancellation is requested: parse feeds, check gists, then wait for the
    /// next execution.
    /// </summary>
    /// <param name="ct">Token that stops the loop and is flowed to all awaited operations.</param>
    protected override async Task ExecuteAsync(CancellationToken ct)
    {
        while (!ct.IsCancellationRequested)
        {
            var startTime = DateTime.UtcNow;

            // Every run starts with empty caches so that results of a previous run are never reused.
            _feedsInDb = [];
            _feedsByFeedId = new Dictionary<int, RssFeed>();

            using (new SelfReportingStopwatch(elapsed => CleanupGistsGauge.Set(elapsed)))
            {
                await ParseFeedsAsync(ct);
                await CleanupGistsAsync(ct);
            }

            // NOTE(review): the unit of the interval value 15 is defined by ServiceUtils - confirm there.
            await ServiceUtils.DelayUntilNextExecutionAsync(startTime, 15, logger, ct);
        }
    }

    /// <summary>Parses and caches every feed definition known to the RSS feed handler, one after another.</summary>
    private async Task ParseFeedsAsync(CancellationToken ct)
    {
        foreach (var feed in rssFeedHandler.Definitions)
        {
            await ParseAndCacheFeedAsync(feed, ct);
        }
    }

    /// <summary>
    /// Parses a single feed, looks it up in the database by its RSS URL and, if found, parses its entries
    /// and caches it for the gist checks of the current run.
    /// </summary>
    /// <remarks>
    /// A feed missing from the database and a <see cref="ParsingFeedException"/> are both logged as
    /// warnings and cause the feed to be skipped; other exceptions propagate.
    /// </remarks>
    private async Task ParseAndCacheFeedAsync(RssFeed feed, CancellationToken ct)
    {
        using var _ = logger?.BeginScope(new Dictionary<string, object> { ["RssUrl"] = feed.RssUrl });
        try
        {
            await rssFeedHandler.ParseFeedAsync(feed, ct);
            var feedInfo = await mariaDbHandler.GetFeedInfoByRssUrlAsync(feed.RssUrl, ct);
            if (feedInfo is null)
            {
                logger?.LogWarning(DidNotFindExpectedFeedInDb, "Could not find feed in db: {RssUrl}", feed.RssUrl);
                return;
            }

            var feedId = feedInfo.Id!.Value;

            // Record the ID before parsing the entries: if parsing throws, the feed is still known to exist
            // in the database, which lets CheckGistAsync skip its gists instead of reporting a missing feed.
            _feedsInDb.Add(feedId);
            feed.ParseEntries(feedId);
            _feedsByFeedId.Add(feedId, feed);
        }
        catch (ParsingFeedException e)
        {
            logger?.LogWarning(ParsingFeedFailed, e, "Skipping feed, failed to parse RSS feed from {RssUrl}",
                feed.RssUrl);
        }
    }

    /// <summary>
    /// Loads all gists, checks those the debouncer reports as ready and publishes how many were checked.
    /// </summary>
    private async Task CleanupGistsAsync(CancellationToken ct)
    {
        var allGists = await mariaDbHandler.GetAllGistsAsync(ct);
        var readyGists = allGists.Where(gist => gistDebouncer.IsReady(gist.Id!.Value, gist.Updated)).ToList();
        foreach (var gist in readyGists)
        {
            await CheckGistAsync(gist, ct);
        }

        GistsCheckedGauge.Set(readyGists.Count);
    }

    /// <summary>
    /// Determines whether the gist should be disabled and ensures that state in MariaDB and ChromaDB.
    /// </summary>
    /// <exception cref="FeedNotFoundException">
    /// The gist's feed was not found in the database during the current run.
    /// </exception>
    private async Task CheckGistAsync(Gist gist, CancellationToken ct)
    {
        try
        {
            if (!_feedsByFeedId.TryGetValue(gist.FeedId, out var feed))
            {
                if (!_feedsInDb.Contains(gist.FeedId))
                {
                    throw new FeedNotFoundException($"Feed with ID {gist.FeedId} not found");
                }

                // The feed exists in the database but was not cached in this run, so the gist cannot be
                // judged right now; leave it untouched.
                return;
            }

            var shouldBeDisabled = await GistShouldBeDisabledAsync(gist, feed, ct);
            await mariaDbHandler.EnsureCorrectDisabledStateForGistAsync(gist.Id!.Value, shouldBeDisabled, ct);
            await chromaDbHandler.EnsureGistHasCorrectMetadataAsync(gist, shouldBeDisabled, ct);
        }
        catch (Exception e) when (e is ExternalServiceException or HttpRequestException)
        {
            logger?.LogError(FetchingPageContentFailed, e, "Skipping gist, failed to fetch page content for {Url}",
                gist.Url.AbsoluteUri);
        }
    }

    /// <summary>
    /// Fetches the gist's page and decides whether the gist should be disabled.
    /// </summary>
    /// <returns>
    /// <c>false</c> if the gist's host is listed in <c>DomainsToIgnore</c> (the page is not fetched then);
    /// otherwise <c>true</c> if the response status is 4xx, if the request was redirected and the gist is no
    /// longer among the feed's entries, or if the feed's paywall check matches the content.
    /// </returns>
    /// <exception cref="InvalidOperationException">The feed has no title.</exception>
    private async Task<bool> GistShouldBeDisabledAsync(Gist gist, RssFeed feed, CancellationToken ct)
    {
        // Host names are case-insensitive, so the configured domains are matched ignoring case.
        var host = gist.Url.Host;
        if (options.Value.DomainsToIgnore.Any(domain => host.Equals(domain, StringComparison.OrdinalIgnoreCase)))
        {
            return false;
        }

        var feedTitle = feed.Title ?? throw new InvalidOperationException($"Feed with ID {feed.Id} has no title");
        using (new SelfReportingStopwatch(elapsed => CheckGistSummary.WithLabels(feedTitle).Observe(elapsed)))
        {
            var response = await webCrawlHandler.FetchAsync(gist.Url.AbsoluteUri, ct);
            if (response.Status is >= 400 and < 500)
            {
                return true; // not available anymore
            }

            if (WasRedirectedAndNotPresentInFeedAnymore(gist, feed, response.Redirected))
            {
                return true;
            }

            return feed.CheckForPaywall(response.Content);
        }
    }

    /// <summary>
    /// Returns <c>true</c> when the request was redirected and no entry of the feed has the gist's URL.
    /// The feed's entries are only scanned when a redirect actually happened.
    /// </summary>
    private static bool WasRedirectedAndNotPresentInFeedAnymore(Gist gist, RssFeed feed, bool redirected) =>
        redirected && !feed.Entries!.Any(entry => entry.Url == gist.Url);
}