Big rewrite - distributed (scalable) consumers for ingestion/scraping - single producer written in C#.

Changed from page scraping to RSS XML scraping.
Includes RealDebridManager hashlist decoding (requires a read-only GitHub PAT, as requests must be authenticated) - this allows ingestion of 200k+ entries in a few hours.
Simplifies a lot of torrentio to deal with the new data.
This commit is contained in:
iPromKnight
2024-02-01 16:38:45 +00:00
parent 6fb4ddcf23
commit ab17ef81be
255 changed files with 18489 additions and 69074 deletions

View File

@@ -0,0 +1,9 @@
namespace Scraper.Models;
/// <summary>
/// Settings object carrying the GitHub credential.
/// </summary>
public class GithubConfiguration
{
    /// <summary>
    /// Section key for these settings.
    /// NOTE(review): presumably the configuration section this type is bound from - confirm against the host setup.
    /// </summary>
    public const string SectionName = "GithubSettings";

    /// <summary>
    /// File name associated with these settings.
    /// NOTE(review): presumably the JSON file the section is loaded from - confirm.
    /// </summary>
    public const string Filename = "github.json";

    /// <summary>
    /// GitHub personal access token; <c>null</c> until a value is assigned.
    /// </summary>
    public string? PAT { get; set; }
}

View File

@@ -0,0 +1,13 @@
namespace Scraper.Models;
/// <summary>
/// Settings object describing a RabbitMQ connection and queue.
/// </summary>
public class RabbitMqConfiguration
{
    /// <summary>
    /// Section key for these settings.
    /// NOTE(review): presumably the configuration section this type is bound from - confirm against the host setup.
    /// </summary>
    public const string SectionName = "RabbitMqConfiguration";

    /// <summary>
    /// File name associated with these settings.
    /// NOTE(review): presumably the JSON file the section is loaded from - confirm.
    /// </summary>
    public const string Filename = "rabbitmq.json";

    /// <summary>Broker host; <c>null</c> until assigned.</summary>
    public string? Host { get; set; }

    /// <summary>User name for the broker; <c>null</c> until assigned.</summary>
    public string? Username { get; set; }

    /// <summary>Password for the broker; <c>null</c> until assigned.</summary>
    public string? Password { get; set; }

    /// <summary>Name of the queue; <c>null</c> until assigned.</summary>
    public string? QueueName { get; set; }

    /// <summary>
    /// Durability flag; <c>false</c> unless set.
    /// NOTE(review): presumably forwarded to the queue declaration - confirm at the call site.
    /// </summary>
    public bool Durable { get; set; }
}

View File

@@ -0,0 +1,5 @@
namespace Scraper.Models;
/// <summary>Result value reported for a torrent insert.</summary>
/// <param name="Success">Success flag for the operation.</param>
/// <param name="InsertedCount">Inserted-item count; defaults to 0.</param>
/// <param name="ErrorMessage">Optional error text; defaults to <c>null</c>.</param>
public record InsertTorrentResult(
    bool Success,
    int InsertedCount = 0,
    string? ErrorMessage = null);
/// <summary>Result value reported for a torrent update.</summary>
/// <param name="Success">Success flag for the operation.</param>
/// <param name="UpdatedCount">Updated-item count; defaults to 0.</param>
/// <param name="ErrorMessage">Optional error text; defaults to <c>null</c>.</param>
public record UpdatedTorrentResult(
    bool Success,
    int UpdatedCount = 0,
    string? ErrorMessage = null);
/// <summary>Result value reported for a page ingestion.</summary>
/// <param name="Success">Success flag for the operation.</param>
/// <param name="ErrorMessage">Optional error text; defaults to <c>null</c>.</param>
public record PageIngestedResult(
    bool Success,
    string? ErrorMessage = null);

View File

@@ -0,0 +1,10 @@
namespace Scraper.Models;
/// <summary>
/// Settings object listing the scraper entries plus a storage connection string.
/// </summary>
public class ScrapeConfiguration
{
    /// <summary>
    /// Section key for these settings.
    /// NOTE(review): presumably the configuration section this type is bound from - confirm against the host setup.
    /// </summary>
    public const string SectionName = "ScrapeConfiguration";

    /// <summary>
    /// File name associated with these settings.
    /// NOTE(review): presumably the JSON file the section is loaded from - confirm.
    /// </summary>
    public const string Filename = "scrapers.json";

    /// <summary>Scraper entries; starts out as an empty list.</summary>
    public List<Scraper> Scrapers { get; set; } = new();

    /// <summary>Connection string for storage; starts out as the empty string.</summary>
    public string StorageConnectionString { get; set; } = "";
}

View File

@@ -0,0 +1,10 @@
namespace Scraper.Models;
/// <summary>
/// Settings for a single scraper entry.
/// </summary>
public class Scraper
{
    /// <summary>Scraper name; <c>null</c> until assigned.</summary>
    public string? Name { get; set; }

    /// <summary>
    /// Interval in seconds; defaults to 60.
    /// NOTE(review): presumably the delay between runs of this scraper - confirm at the usage site.
    /// </summary>
    public int IntervalSeconds { get; set; } = 60;

    /// <summary>Enabled flag; defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;
}

View File

@@ -0,0 +1,20 @@
namespace Scraper.Models;
/// <summary>
/// Represents a crawled torrent from one of our supported sources.
/// </summary>
public class Torrent
{
    /// <summary>
    /// Creates a torrent whose <see cref="CreatedAt"/> and <see cref="UpdatedAt"/>
    /// are taken from one UTC clock reading, so a freshly created instance
    /// always has identical timestamps.
    /// </summary>
    public Torrent()
    {
        // Read the clock once: two separate DateTime.UtcNow reads can differ by a few ticks.
        DateTime now = DateTime.UtcNow;
        CreatedAt = now;
        UpdatedAt = now;
    }

    /// <summary>Identifier; <c>null</c> until assigned.</summary>
    public long? Id { get; set; }

    /// <summary>Torrent name; <c>null</c> until assigned.</summary>
    public string? Name { get; set; }

    /// <summary>Source the torrent came from; <c>null</c> until assigned.</summary>
    public string? Source { get; set; }

    /// <summary>Category label; <c>null</c> until assigned.</summary>
    public string? Category { get; set; }

    /// <summary>Info hash of the torrent; <c>null</c> until assigned.</summary>
    public string? InfoHash { get; set; }

    /// <summary>
    /// Size, held as text.
    /// NOTE(review): unit/format is not visible here - confirm with the producer.
    /// </summary>
    public string? Size { get; set; }

    /// <summary>Seeder count; 0 unless set.</summary>
    public int Seeders { get; set; }

    /// <summary>Leecher count; 0 unless set.</summary>
    public int Leechers { get; set; }

    /// <summary>
    /// IMDb reference; <c>null</c> until assigned.
    /// NOTE(review): presumably an IMDb title id - confirm.
    /// </summary>
    public string? Imdb { get; set; }

    /// <summary>Processed flag; <c>false</c> unless set.</summary>
    public bool Processed { get; set; }

    /// <summary>Creation timestamp (UTC); set at construction.</summary>
    public DateTime CreatedAt { get; set; }

    /// <summary>Last-update timestamp (UTC); equals <see cref="CreatedAt"/> at construction.</summary>
    public DateTime UpdatedAt { get; set; }
}