Big rewrite - scalable distributed consumers for ingestion/scraping - single producer written in C#.
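
A minimal in-process sketch of the single-producer / many-consumer shape, using System.Threading.Channels. The commit's actual consumers are separate distributed processes; the feed URLs and consumer count below are illustrative only.

using System;
using System.Linq;
using System.Threading.Channels;
using System.Threading.Tasks;

// Bounded channel: the producer enqueues work, consumers drain it in parallel.
var channel = Channel.CreateBounded<string>(capacity: 1_000);

// Single producer: push feed URLs (placeholders) into the channel.
var producer = Task.Run(async () =>
{
    foreach (var url in new[] { "https://example.org/feed-a.xml", "https://example.org/feed-b.xml" })
        await channel.Writer.WriteAsync(url);
    channel.Writer.Complete();
});

// Scalable consumers: each reads until the channel is completed and drained.
var consumers = Enumerable.Range(0, 4).Select(_ => Task.Run(async () =>
{
    await foreach (var url in channel.Reader.ReadAllAsync())
        Console.WriteLine($"scraping {url}");
}));

await Task.WhenAll(consumers.Prepend(producer));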

Changed from page scraping to RSS XML scraping.
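
For context, a hedged sketch of the RSS side: pulling <item> elements from a feed with XDocument. The feed URL and the RSS 2.0 element names are assumptions, not the commit's actual feeds.

using System;
using System.Net.Http;
using System.Xml.Linq;

using var http = new HttpClient();
var xml = await http.GetStringAsync("https://example.org/feed.xml"); // placeholder feed
var doc = XDocument.Parse(xml);

// Standard RSS 2.0 layout: <rss><channel><item>...</item></channel></rss>
foreach (var item in doc.Descendants("item"))
{
    var title = (string?)item.Element("title");
    var link = (string?)item.Element("link");
    Console.WriteLine($"{title} -> {link}");
}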
Includes RealDebridManager hashlist decoding (requires a GitHub read-only PAT, as requests must be authenticated). This allows ingestion of 200k+ entries in a few hours.
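
A sketch of authenticating a GitHub request with a read-only PAT via the Authorization header; the repository path is a placeholder, not the actual hashlist location.

using System;
using System.Net.Http;
using System.Net.Http.Headers;

using var http = new HttpClient();
http.DefaultRequestHeaders.Authorization =
    new AuthenticationHeaderValue("Bearer", Environment.GetEnvironmentVariable("GITHUB_PAT"));
http.DefaultRequestHeaders.UserAgent.ParseAdd("scraper/1.0"); // GitHub rejects requests without a User-Agent
http.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.raw"); // return raw file content

var hashlist = await http.GetStringAsync(
    "https://api.github.com/repos/OWNER/REPO/contents/path/to/hashlist"); // placeholder path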
Simplifies much of torrentio to work with the new data.
iPromKnight
2024-02-01 16:38:45 +00:00
parent 6fb4ddcf23
commit ab17ef81be
255 changed files with 18489 additions and 69074 deletions


@@ -0,0 +1,25 @@
using Microsoft.Extensions.Logging;

namespace Scraper.Crawlers;

public abstract class BaseCrawler(ILogger<BaseCrawler> logger, IDataStorage storage) : ICrawler
{
    // Per-crawler configuration: feed-field mappings, feed URL, and source name.
    protected abstract IReadOnlyDictionary<string, string> Mappings { get; }
    protected abstract string Url { get; }
    protected abstract string Source { get; }

    protected IDataStorage Storage => storage;

    // No-op by default; concrete crawlers override this with their scrape loop.
    public virtual Task Execute() => Task.CompletedTask;

    // Writes a batch of torrents to storage, logging the outcome either way.
    protected async Task<InsertTorrentResult> InsertTorrents(IReadOnlyCollection<Torrent> torrents)
    {
        var result = await storage.InsertTorrents(torrents);
        if (!result.Success)
        {
            logger.LogWarning("Ingestion Failed: [{Error}]", result.ErrorMessage);
            return result;
        }

        logger.LogInformation("Ingestion Successful - Wrote {Count} new torrents", result.InsertedCount);
        return result;
    }
}
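
A hypothetical concrete crawler showing how BaseCrawler is meant to be subclassed; the URL, source name, mapping, and feed-parsing step are illustrative only.

public class ExampleRssCrawler(ILogger<ExampleRssCrawler> logger, IDataStorage storage)
    : BaseCrawler(logger, storage)
{
    protected override IReadOnlyDictionary<string, string> Mappings =>
        new Dictionary<string, string> { ["title"] = "Name" }; // hypothetical field mapping

    protected override string Url => "https://example.org/feed.xml"; // placeholder feed
    protected override string Source => "example";

    public override async Task Execute()
    {
        var torrents = new List<Torrent>(); // parse the feed at Url into Torrent records here
        await InsertTorrents(torrents);
    }
}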