mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
Woke up to see a discussion about torrentio scraping: powered by the community.
Was a little inspired. Now that we have a self-populating database of IMDb IDs, why shouldn't we also have the ability to scrape any other instance of torrentio or knightcrawler? Also restructured the producer into vertical slices to make it easier to work with; there was too much flicking back and forth between Jobs and Crawlers when configuring.
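As a rough sketch of where that could go (illustration only, not part of this commit), a crawler for another torrentio or knightcrawler instance could reuse the same BaseCrawler shape introduced below. The stream route and the IImdbIdSource abstraction here are assumptions, not existing code:

// Hypothetical sketch only - the endpoint layout and IImdbIdSource are assumed, not part of this commit.
namespace Producer.Features.Crawlers.Torrentio;

public class TorrentioCrawler(
    IHttpClientFactory httpClientFactory,
    ILogger<TorrentioCrawler> logger,
    IDataStorage storage,
    IImdbIdSource imdbIds) : BaseCrawler(logger, storage) // IImdbIdSource: assumed wrapper over the self-populating IMDb id table
{
    protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
    protected override string Url => "https://torrentio.example/stream/movie"; // assumed Stremio-style stream route
    protected override string Source => "Torrentio";

    public override async Task Execute()
    {
        var client = httpClientFactory.CreateClient("Scraper");

        await foreach (var imdbId in imdbIds.GetUnscrapedIds()) // assumed IAsyncEnumerable<string> of IMDb ids
        {
            // Stremio addons conventionally expose /stream/{type}/{id}.json; treat the exact path as an assumption.
            var response = await client.GetStringAsync($"{Url}/{imdbId}.json");
            var streams = JsonDocument.Parse(response).RootElement.GetProperty("streams");

            var torrents = streams.EnumerateArray()
                .Select(s => new Torrent
                {
                    Source = Source,
                    Name = s.GetProperty("title").GetString(),
                    InfoHash = s.GetProperty("infoHash").GetString(),
                    Seeders = 0,
                    Leechers = 0,
                })
                .Where(t => !string.IsNullOrEmpty(t.Name) && !string.IsNullOrEmpty(t.InfoHash))
                .ToList();

            if (torrents.Count > 0)
            {
                await InsertTorrents(torrents);
            }
        }
    }
}

The real work would be rate limiting and pairing each IMDb id with the right type (movie vs. series), which this sketch glosses over.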
src/producer/Features/Crawlers/Dmm/DebridMediaManagerCrawler.cs (new file, 146 lines)
@@ -0,0 +1,146 @@
namespace Producer.Features.Crawlers.Dmm;

public partial class DebridMediaManagerCrawler(
    IHttpClientFactory httpClientFactory,
    ILogger<DebridMediaManagerCrawler> logger,
    IDataStorage storage,
    GithubConfiguration githubConfiguration) : BaseCrawler(logger, storage)
{
    [GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
    private static partial Regex HashCollectionMatcher();

    [GeneratedRegex(@"[sS]([0-9]{1,2})|seasons?[\s-]?([0-9]{1,2})", RegexOptions.IgnoreCase, "en-GB")]
    private static partial Regex SeasonMatcher();

    private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";

    protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
    protected override string Url => "https://api.github.com/repos/debridmediamanager/hashlists/git/trees/main?recursive=1";
    protected override string Source => "DMM";

    public override async Task Execute()
    {
        var client = httpClientFactory.CreateClient("Scraper");
        client.DefaultRequestHeaders.Authorization = new("Bearer", githubConfiguration.PAT);
        client.DefaultRequestHeaders.UserAgent.ParseAdd("curl");

        var jsonBody = await client.GetStringAsync(Url);

        var json = JsonDocument.Parse(jsonBody);

        var entriesArray = json.RootElement.GetProperty("tree");

        logger.LogInformation("Found {Entries} total DMM pages", entriesArray.GetArrayLength());

        foreach (var entry in entriesArray.EnumerateArray())
        {
            await ParsePage(entry, client);
        }
    }

    private async Task ParsePage(JsonElement entry, HttpClient client)
    {
        var (pageIngested, name) = await IsAlreadyIngested(entry);

        if (string.IsNullOrEmpty(name) || pageIngested)
        {
            return;
        }

        var pageSource = await client.GetStringAsync($"{DownloadBaseUrl}/{name}");

        await ExtractPageContents(pageSource, name);
    }

    private async Task ExtractPageContents(string pageSource, string name)
    {
        var match = HashCollectionMatcher().Match(pageSource);

        if (!match.Success)
        {
            logger.LogWarning("Failed to match hash collection for {Name}", name);
            await Storage.MarkPageAsIngested(name);
            return;
        }

        var encodedJson = match.Groups.Values.ElementAtOrDefault(1);

        if (string.IsNullOrEmpty(encodedJson?.Value))
        {
            logger.LogWarning("Failed to extract encoded json for {Name}", name);
            return;
        }

        await ProcessExtractedContentsAsTorrentCollection(encodedJson.Value, name);
    }

    private async Task ProcessExtractedContentsAsTorrentCollection(string encodedJson, string name)
    {
        var decodedJson = LZString.DecompressFromEncodedURIComponent(encodedJson);

        var json = JsonDocument.Parse(decodedJson);

        await InsertTorrentsForPage(json);

        var result = await Storage.MarkPageAsIngested(name);

        if (!result.Success)
        {
            logger.LogWarning("Failed to mark page as ingested: [{Error}]", result.ErrorMessage);
            return;
        }

        logger.LogInformation("Successfully marked page as ingested");
    }

    private Torrent? ParseTorrent(JsonElement item)
    {
        var torrent = new Torrent
        {
            Source = Source,
            Name = item.GetProperty("filename").GetString(),
            Size = item.GetProperty("bytes").GetInt64().ToString(),
            InfoHash = item.GetProperty("hash").ToString(),
            Seeders = 0,
            Leechers = 0,
        };

        if (string.IsNullOrEmpty(torrent.Name))
        {
            return null;
        }

        torrent.Category = SeasonMatcher().IsMatch(torrent.Name) ? "tv" : "movies";

        return torrent;
    }

    private async Task InsertTorrentsForPage(JsonDocument json)
    {
        var torrents = json.RootElement.EnumerateArray()
            .Select(ParseTorrent)
            .ToList();

        if (torrents.Count == 0)
        {
            logger.LogWarning("No torrents found in {Source} response", Source);
            return;
        }

        await InsertTorrents(torrents!);
    }

    private async Task<(bool Success, string? Name)> IsAlreadyIngested(JsonElement entry)
    {
        var name = entry.GetProperty("path").GetString();

        if (string.IsNullOrEmpty(name))
        {
            return (false, null);
        }

        var pageIngested = await Storage.PageIngested(name);

        return (pageIngested, name);
    }
}
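For reference, the two JSON shapes this crawler reads, inferred from the property accesses above (field names come from the code; the values are invented):

// Illustrative only: field names are taken from the property reads above, values are made up.
// 1) GitHub git/trees listing - Execute() walks root.tree[i].path.
var treeSample = JsonDocument.Parse("""{ "tree": [ { "path": "a-hashlist-page.html", "type": "blob" } ] }""");

// 2) Decompressed hashlist payload - ParseTorrent() reads filename, bytes and hash from each element.
var hashlistSample = JsonDocument.Parse("""[ { "filename": "Some.Show.S01.1080p.WEB-DL.mkv", "bytes": 1234567890, "hash": "ffffffffffffffffffffffffffffffffffffffff" } ]""");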