mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
Woke up to see a discussion about torrentio scraping: powered by community
Was a little inspired. Now we have a self-populating database of IMDb IDs — why shouldn't we also have the ability to scrape any other instance of Torrentio, or KnightCrawler? Also restructured the producer to be vertically sliced to make it easier to work with; there was too much flicking back and forth between Jobs and Crawlers when configuring.
This commit is contained in:
25
src/producer/Features/CrawlerSupport/BaseCrawler.cs
Normal file
25
src/producer/Features/CrawlerSupport/BaseCrawler.cs
Normal file
@@ -0,0 +1,25 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Shared base for all crawlers: exposes source metadata for derived types and
/// provides a logged write-through to <see cref="IDataStorage"/>.
/// </summary>
public abstract class BaseCrawler(ILogger<BaseCrawler> logger, IDataStorage storage) : ICrawler
{
    /// <summary>Maps source-specific field names to our canonical field names.</summary>
    protected abstract IReadOnlyDictionary<string, string> Mappings { get; }

    /// <summary>Endpoint this crawler pulls from.</summary>
    protected abstract string Url { get; }

    /// <summary>Source name used in log messages.</summary>
    protected abstract string Source { get; }

    /// <summary>Backing storage, exposed for derived crawlers.</summary>
    protected IDataStorage Storage => storage;

    /// <summary>No-op by default; concrete crawlers override this to run a crawl.</summary>
    public virtual Task Execute() => Task.CompletedTask;

    /// <summary>
    /// Persists a batch of torrents and logs the outcome.
    /// </summary>
    /// <param name="torrent">Batch of parsed torrents to persist.</param>
    /// <returns>The storage layer's result, whether it succeeded or failed.</returns>
    protected async Task<InsertTorrentResult> InsertTorrents(IReadOnlyCollection<Torrent> torrent)
    {
        var outcome = await storage.InsertTorrents(torrent);

        if (outcome.Success)
        {
            logger.LogInformation("Ingestion Successful - Wrote {Count} new torrents", outcome.InsertedCount);
        }
        else
        {
            logger.LogWarning("Ingestion Failed: [{Error}]", outcome.ErrorMessage);
        }

        return outcome;
    }
}
|
||||
45
src/producer/Features/CrawlerSupport/BaseJsonCrawler.cs
Normal file
45
src/producer/Features/CrawlerSupport/BaseJsonCrawler.cs
Normal file
@@ -0,0 +1,45 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Base for crawlers whose source returns a JSON array: fetches <see cref="BaseCrawler.Url"/>,
/// parses each array element via <see cref="ParseTorrent"/>, and stores the results.
/// </summary>
public abstract class BaseJsonCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseJsonCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
    // Named client configured in AddCrawlers; factory-managed, so never disposed here.
    private readonly HttpClient _client = httpClientFactory.CreateClient(Literals.CrawlerClient);

    /// <summary>
    /// Runs one crawl: downloads the JSON body, parses every array element, and
    /// inserts whatever <see cref="ParseTorrent"/> could extract.
    /// </summary>
    /// <param name="collectionName">Reserved for derived crawlers that scope a crawl to one collection; unused here.</param>
    protected virtual async Task Execute(string collectionName)
    {
        logger.LogInformation("Starting {Source} crawl", Source);

        // Fix: the previous version created (and disposed) a second client here with a
        // hard-coded "Scraper" name that was never used; _client already targets the
        // shared crawler client registered under Literals.CrawlerClient.
        var jsonBody = await _client.GetStringAsync(Url);
        using var json = JsonDocument.Parse(jsonBody);

        var torrents = json.RootElement.EnumerateArray()
            .Select(ParseTorrent)
            .Where(x => x is not null)
            .ToList();

        if (torrents.Count == 0)
        {
            logger.LogWarning("No torrents found in {Source} response", Source);
            return;
        }

        await InsertTorrents(torrents!);
    }

    /// <summary>
    /// Copies the info-hash out of <paramref name="item"/> into <paramref name="torrent"/>
    /// when a mapping for <paramref name="infoHashKey"/> exists. Note this uses
    /// <see cref="JsonElement.GetProperty(string)"/>, which throws if the mapped
    /// property is absent from the element.
    /// </summary>
    protected virtual void HandleInfoHash(JsonElement item, Torrent torrent, string infoHashKey)
    {
        // Single dictionary lookup instead of ContainsKey + indexer.
        if (!Mappings.TryGetValue(infoHashKey, out var mappedProperty))
        {
            return;
        }

        var infoHash = item.GetProperty(mappedProperty).GetString();

        if (infoHash is not null)
        {
            torrent.InfoHash = infoHash;
        }
    }

    /// <summary>Parses one JSON array element into a torrent, or null when it cannot.</summary>
    protected abstract Torrent? ParseTorrent(JsonElement item);
}
|
||||
43
src/producer/Features/CrawlerSupport/BaseXmlCrawler.cs
Normal file
43
src/producer/Features/CrawlerSupport/BaseXmlCrawler.cs
Normal file
@@ -0,0 +1,43 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Base for crawlers whose source returns XML (e.g. an RSS-style feed): fetches
/// <see cref="BaseCrawler.Url"/>, parses each &lt;item&gt; element via
/// <see cref="ParseTorrent"/>, and stores the results.
/// </summary>
public abstract class BaseXmlCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseXmlCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
    /// <summary>
    /// Runs one crawl: downloads the XML body, parses every "item" descendant, and
    /// inserts whatever <see cref="ParseTorrent"/> could extract.
    /// </summary>
    public override async Task Execute()
    {
        logger.LogInformation("Starting {Source} crawl", Source);

        using var client = httpClientFactory.CreateClient(Literals.CrawlerClient);
        var xml = await client.GetStringAsync(Url);
        var xmlRoot = XElement.Parse(xml);

        var torrents = xmlRoot.Descendants("item")
            .Select(ParseTorrent)
            .Where(x => x is not null)
            .ToList();

        if (torrents.Count == 0)
        {
            logger.LogWarning("No torrents found in {Source} response", Source);
            return;
        }

        await InsertTorrents(torrents!);
    }

    /// <summary>
    /// Copies the info-hash out of <paramref name="itemNode"/> into
    /// <paramref name="torrent"/> when a mapping for <paramref name="infoHashKey"/>
    /// exists and the mapped child element is present.
    /// </summary>
    protected virtual void HandleInfoHash(XElement itemNode, Torrent torrent, string infoHashKey)
    {
        // Single dictionary lookup instead of ContainsKey + indexer.
        if (!Mappings.TryGetValue(infoHashKey, out var mappedElement))
        {
            return;
        }

        var infoHash = itemNode.Element(mappedElement)?.Value;

        if (infoHash is not null)
        {
            torrent.InfoHash = infoHash;
        }
    }

    /// <summary>Parses one item element into a torrent, or null when it cannot.</summary>
    protected abstract Torrent? ParseTorrent(XElement itemNode);
}
|
||||
11
src/producer/Features/CrawlerSupport/CrawlerProvider.cs
Normal file
11
src/producer/Features/CrawlerSupport/CrawlerProvider.cs
Normal file
@@ -0,0 +1,11 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Resolves crawler implementations out of the DI container — either all of
/// them at once, or a single one by its registered key (the concrete type name).
/// </summary>
public class CrawlerProvider(IServiceProvider serviceProvider) : ICrawlerProvider
{
    /// <inheritdoc />
    public IEnumerable<ICrawler> GetAll() => serviceProvider.GetServices<ICrawler>();

    /// <inheritdoc />
    public ICrawler Get(string name) => serviceProvider.GetRequiredKeyedService<ICrawler>(name);
}
|
||||
6
src/producer/Features/CrawlerSupport/ICrawler.cs
Normal file
6
src/producer/Features/CrawlerSupport/ICrawler.cs
Normal file
@@ -0,0 +1,6 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// A single torrent-source crawler. Concrete implementations are discovered via
/// reflection and registered by <c>AddCrawlers</c>.
/// </summary>
public interface ICrawler
{
    /// <summary>Runs one full crawl of the source.</summary>
    Task Execute();
}
|
||||
8
src/producer/Features/CrawlerSupport/ICrawlerProvider.cs
Normal file
8
src/producer/Features/CrawlerSupport/ICrawlerProvider.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Lookup facade over the registered <see cref="ICrawler"/> implementations.
/// </summary>
public interface ICrawlerProvider
{
    /// <summary>Returns every registered crawler.</summary>
    IEnumerable<ICrawler> GetAll();

    /// <summary>Returns the crawler registered under <paramref name="name"/> (the concrete type name); throws if none exists.</summary>
    ICrawler Get(string name);
}
|
||||
6
src/producer/Features/CrawlerSupport/Literals.cs
Normal file
6
src/producer/Features/CrawlerSupport/Literals.cs
Normal file
@@ -0,0 +1,6 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>Shared string constants for the crawler feature.</summary>
public static class Literals
{
    // Name of the IHttpClientFactory named client used by all crawlers.
    public const string CrawlerClient = "Scraper";
}
|
||||
5
src/producer/Features/CrawlerSupport/Results.cs
Normal file
5
src/producer/Features/CrawlerSupport/Results.cs
Normal file
@@ -0,0 +1,5 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
// Outcome of a batch torrent insert; ErrorMessage is populated only on failure.
public record InsertTorrentResult(bool Success, int InsertedCount = 0, string? ErrorMessage = null);

// Outcome of a batch torrent update; ErrorMessage is populated only on failure.
public record UpdatedTorrentResult(bool Success, int UpdatedCount = 0, string? ErrorMessage = null);

// Outcome of ingesting a single scraped page; ErrorMessage is populated only on failure.
public record PageIngestedResult(bool Success, string? ErrorMessage = null);
|
||||
10
src/producer/Features/CrawlerSupport/Scraper.cs
Normal file
10
src/producer/Features/CrawlerSupport/Scraper.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// Describes one external scraper instance (presumably bound from
/// configuration — NOTE(review): confirm against the consuming config section).
/// </summary>
public class Scraper
{
    // Name identifying this scraper instance.
    public string? Name { get; set; }

    // How often the scraper runs, in seconds. Defaults to one minute.
    public int IntervalSeconds { get; set; } = 60;

    // Scrapers are enabled unless explicitly switched off.
    public bool Enabled { get; set; } = true;
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
/// <summary>
/// DI wiring for the crawler feature: registers the shared crawler HttpClient
/// and every concrete <see cref="ICrawler"/> found in this assembly.
/// </summary>
internal static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers each crawler twice: keyed by its type name (backing
    /// <c>ICrawlerProvider.Get</c>) and unkeyed (backing <c>ICrawlerProvider.GetAll</c>).
    /// </summary>
    internal static IServiceCollection AddCrawlers(this IServiceCollection services)
    {
        services.AddHttpClient(Literals.CrawlerClient);

        // typeof(ICrawler).Assembly is never null, unlike Assembly.GetAssembly(...),
        // which is annotated as returning Assembly? and was dereferenced unchecked.
        var crawlerTypes = typeof(ICrawler).Assembly
            .GetTypes()
            .Where(t => t is {IsClass: true, IsAbstract: false} && typeof(ICrawler).IsAssignableFrom(t));

        foreach (var type in crawlerTypes)
        {
            // Keyed registration backs ICrawlerProvider.Get(name).
            services.AddKeyedTransient(typeof(ICrawler), type.Name, type);

            // Bug fix: keyed registrations are NOT returned by the non-keyed
            // GetServices<ICrawler>() call that CrawlerProvider.GetAll() makes,
            // so an unkeyed registration is required as well.
            services.AddTransient(typeof(ICrawler), type);
        }

        services
            .AddSingleton<ICrawlerProvider, CrawlerProvider>();

        return services;
    }
}
|
||||
20
src/producer/Features/CrawlerSupport/Torrent.cs
Normal file
20
src/producer/Features/CrawlerSupport/Torrent.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
namespace Producer.Features.CrawlerSupport;
|
||||
|
||||
// Torrent represents a crawled torrent from one of our
// supported sources.
public class Torrent
{
    public long? Id { get; set; }
    public string? Name { get; set; }
    // Which crawler Source produced this row.
    public string? Source { get; set; }
    public string? Category { get; set; }
    public string? InfoHash { get; set; }
    // Kept as a string; format appears source-specific — TODO confirm.
    public string? Size { get; set; }
    public int Seeders { get; set; }
    public int Leechers { get; set; }
    public string? Imdb { get; set; }

    // False until downstream processing has handled this torrent.
    // (Redundant "= false" initializer removed — default(bool) is already false.)
    public bool Processed { get; set; }
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;
    public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;
}
|
||||
Reference in New Issue
Block a user