Woke up to see a discussion about torrentio scraping: powered by community

Was a little inspired. Now that we have a self-populating database of IMDb IDs, why shouldn't we have the ability to scrape any other instance of Torrentio or Knightcrawler?

Also restructured the producer to be vertically sliced to make it easier to work with
Too much flicking back and forth between Jobs and Crawlers when configuring
This commit is contained in:
iPromKnight
2024-03-02 18:41:57 +00:00
parent 98115e0cf7
commit 95fa48c851
59 changed files with 733 additions and 261 deletions

View File

@@ -0,0 +1,25 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Common base for all crawlers: holds source metadata and a shared
/// "insert and log the outcome" helper around <see cref="IDataStorage"/>.
/// </summary>
public abstract class BaseCrawler(ILogger<BaseCrawler> logger, IDataStorage storage) : ICrawler
{
    /// <summary>Field/property name mappings used by derived crawlers when parsing responses.</summary>
    protected abstract IReadOnlyDictionary<string, string> Mappings { get; }

    /// <summary>Endpoint this crawler fetches.</summary>
    protected abstract string Url { get; }

    /// <summary>Identifier of the crawled source, used in log messages.</summary>
    protected abstract string Source { get; }

    /// <summary>Backing storage exposed to derived crawlers.</summary>
    protected IDataStorage Storage => storage;

    /// <summary>Default no-op execution; derived crawlers override this.</summary>
    public virtual Task Execute() => Task.CompletedTask;

    /// <summary>
    /// Persists the given batch of torrents and logs the outcome.
    /// </summary>
    /// <param name="torrent">Batch of torrents to insert.</param>
    /// <returns>The storage result, whether or not the insert succeeded.</returns>
    protected async Task<InsertTorrentResult> InsertTorrents(IReadOnlyCollection<Torrent> torrent)
    {
        var result = await storage.InsertTorrents(torrent);

        if (result.Success)
        {
            logger.LogInformation("Ingestion Successful - Wrote {Count} new torrents", result.InsertedCount);
        }
        else
        {
            logger.LogWarning("Ingestion Failed: [{Error}]", result.ErrorMessage);
        }

        return result;
    }
}

View File

@@ -0,0 +1,45 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Base crawler for sources that return a JSON array of torrent items.
/// </summary>
public abstract class BaseJsonCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseJsonCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
    private readonly HttpClient _client = httpClientFactory.CreateClient(Literals.CrawlerClient);

    /// <summary>
    /// Fetches <see cref="BaseCrawler.Url"/>, parses the JSON array response and stores any torrents found.
    /// </summary>
    /// <param name="collectionName">Name of the target collection. NOTE(review): currently unused in this base implementation — confirm intent.</param>
    protected virtual async Task Execute(string collectionName)
    {
        logger.LogInformation("Starting {Source} crawl", Source);

        // Fix: previously a second HttpClient was created (with the hard-coded name
        // "Scraper") and disposed without ever being used; the cached _client is the
        // one that actually performs the request.
        var jsonBody = await _client.GetStringAsync(Url);
        using var json = JsonDocument.Parse(jsonBody);

        var torrents = json.RootElement.EnumerateArray()
            .Select(ParseTorrent)
            .Where(x => x is not null)
            .ToList();

        if (torrents.Count == 0)
        {
            logger.LogWarning("No torrents found in {Source} response", Source);
            return;
        }

        await InsertTorrents(torrents!);
    }

    /// <summary>
    /// Copies the info-hash property (resolved through <see cref="BaseCrawler.Mappings"/> via
    /// <paramref name="infoHashKey"/>) from the JSON item onto the torrent.
    /// No-op when the key is unmapped or the property is missing or null.
    /// </summary>
    protected virtual void HandleInfoHash(JsonElement item, Torrent torrent, string infoHashKey)
    {
        // Single dictionary lookup instead of ContainsKey + indexer.
        if (!Mappings.TryGetValue(infoHashKey, out var propertyName))
        {
            return;
        }

        // Robustness: TryGetProperty avoids a KeyNotFoundException when the source omits the field.
        if (item.TryGetProperty(propertyName, out var property) && property.GetString() is { } infoHash)
        {
            torrent.InfoHash = infoHash;
        }
    }

    /// <summary>Parses one JSON array element into a torrent, or null to skip it.</summary>
    protected abstract Torrent? ParseTorrent(JsonElement item);
}

View File

@@ -0,0 +1,43 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Base crawler for sources that return an XML/RSS feed of &lt;item&gt; elements.
/// </summary>
public abstract class BaseXmlCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseXmlCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
    /// <summary>
    /// Fetches <see cref="BaseCrawler.Url"/>, parses the XML response and stores any
    /// torrents found in its &lt;item&gt; descendants.
    /// </summary>
    public override async Task Execute()
    {
        logger.LogInformation("Starting {Source} crawl", Source);

        using var client = httpClientFactory.CreateClient(Literals.CrawlerClient);
        var xml = await client.GetStringAsync(Url);
        var xmlRoot = XElement.Parse(xml);

        var torrents = xmlRoot.Descendants("item")
            .Select(ParseTorrent)
            .Where(x => x is not null)
            .ToList();

        if (torrents.Count == 0)
        {
            logger.LogWarning("No torrents found in {Source} response", Source);
            return;
        }

        await InsertTorrents(torrents!);
    }

    /// <summary>
    /// Copies the info-hash element (resolved through <see cref="BaseCrawler.Mappings"/> via
    /// <paramref name="infoHashKey"/>) from the item node onto the torrent.
    /// No-op when the key is unmapped or the element is absent.
    /// </summary>
    protected virtual void HandleInfoHash(XElement itemNode, Torrent torrent, string infoHashKey)
    {
        // Single dictionary lookup instead of ContainsKey + indexer (double lookup).
        if (!Mappings.TryGetValue(infoHashKey, out var elementName))
        {
            return;
        }

        if (itemNode.Element(elementName)?.Value is { } infoHash)
        {
            torrent.InfoHash = infoHash;
        }
    }

    /// <summary>Parses one &lt;item&gt; element into a torrent, or null to skip it.</summary>
    protected abstract Torrent? ParseTorrent(XElement itemNode);
}

View File

@@ -0,0 +1,11 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Resolves crawlers from the DI container, either all at once or by keyed-service name.
/// </summary>
public class CrawlerProvider(IServiceProvider serviceProvider) : ICrawlerProvider
{
    /// <summary>Returns every registered <see cref="ICrawler"/> implementation.</summary>
    public IEnumerable<ICrawler> GetAll()
    {
        return serviceProvider.GetServices<ICrawler>();
    }

    /// <summary>Returns the crawler registered under the given key; throws if none is registered.</summary>
    public ICrawler Get(string name)
    {
        return serviceProvider.GetRequiredKeyedService<ICrawler>(name);
    }
}

View File

@@ -0,0 +1,6 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// A crawler that can be executed to ingest torrents from a single source.
/// </summary>
public interface ICrawler
{
    /// <summary>Runs one crawl cycle for this crawler's source.</summary>
    Task Execute();
}

View File

@@ -0,0 +1,8 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Resolves registered <see cref="ICrawler"/> instances.
/// </summary>
public interface ICrawlerProvider
{
    /// <summary>Returns every registered crawler.</summary>
    IEnumerable<ICrawler> GetAll();

    /// <summary>Returns the crawler registered under the given name.</summary>
    ICrawler Get(string name);
}

View File

@@ -0,0 +1,6 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Shared string constants for the crawler feature.
/// </summary>
public static class Literals
{
    /// <summary>Name of the named HttpClient registered for crawler requests.</summary>
    public const string CrawlerClient = "Scraper";
}

View File

@@ -0,0 +1,5 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>Outcome of inserting a batch of torrents into storage.</summary>
public record InsertTorrentResult(bool Success, int InsertedCount = 0, string? ErrorMessage = null);

/// <summary>Outcome of updating a batch of torrents in storage.</summary>
public record UpdatedTorrentResult(bool Success, int UpdatedCount = 0, string? ErrorMessage = null);

/// <summary>Outcome of ingesting a single page of crawled results.</summary>
public record PageIngestedResult(bool Success, string? ErrorMessage = null);

View File

@@ -0,0 +1,10 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// Configuration entry for a single scraper/crawler.
/// </summary>
public class Scraper
{
    /// <summary>Scraper name — presumably matches a crawler type's keyed-service name; confirm against configuration.</summary>
    public string? Name { get; set; }

    /// <summary>Interval between runs, in seconds (defaults to 60).</summary>
    public int IntervalSeconds { get; set; } = 60;

    /// <summary>Whether this scraper should run (defaults to true).</summary>
    public bool Enabled { get; set; } = true;
}

View File

@@ -0,0 +1,23 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// DI registration for the crawler feature.
/// </summary>
internal static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers the shared crawler HttpClient, every concrete <see cref="ICrawler"/> in this
    /// assembly (keyed by its type name), and the <see cref="ICrawlerProvider"/> used to resolve them.
    /// </summary>
    internal static IServiceCollection AddCrawlers(this IServiceCollection services)
    {
        services.AddHttpClient(Literals.CrawlerClient);

        // Fix: Assembly.GetAssembly(...) returns Assembly? and risked a
        // NullReferenceException on GetTypes(); typeof(T).Assembly is never null.
        var crawlerTypes = typeof(ICrawler).Assembly
            .GetTypes()
            .Where(t => t is { IsClass: true, IsAbstract: false } && typeof(ICrawler).IsAssignableFrom(t));

        foreach (var type in crawlerTypes)
        {
            // Keyed by concrete type name so ICrawlerProvider.Get(name) can resolve one crawler.
            services.AddKeyedTransient(typeof(ICrawler), type.Name, type);
        }

        services.AddSingleton<ICrawlerProvider, CrawlerProvider>();

        return services;
    }
}

View File

@@ -0,0 +1,20 @@
namespace Producer.Features.CrawlerSupport;
/// <summary>
/// A crawled torrent from one of our supported sources.
/// </summary>
public class Torrent
{
    /// <summary>Identifier — presumably assigned by storage; null until then. Confirm against IDataStorage.</summary>
    public long? Id { get; set; }

    public string? Name { get; set; }

    /// <summary>Which crawler/source produced this torrent.</summary>
    public string? Source { get; set; }

    public string? Category { get; set; }

    /// <summary>BitTorrent info hash, when the source exposes one.</summary>
    public string? InfoHash { get; set; }

    public string? Size { get; set; }

    public int Seeders { get; set; }

    public int Leechers { get; set; }

    /// <summary>Associated IMDb identifier, when the source exposes one.</summary>
    public string? Imdb { get; set; }

    // Fix: removed the redundant "= false" initializer — false is already the default for bool.
    public bool Processed { get; set; }

    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;
}