Woke up to see a discussion about torrentio scraping: "powered by community"

I was a little inspired. Now that we have a self-populating database of IMDb IDs, why shouldn't we be able to scrape any other instance of torrentio, or knightcrawler?

Also restructured the producer into vertical slices to make it easier to work with.
There was too much flicking back and forth between Jobs and Crawlers when configuring.
iPromKnight
2024-03-02 18:41:57 +00:00
parent 98115e0cf7
commit 95fa48c851
59 changed files with 733 additions and 261 deletions

View File

@@ -30,6 +30,11 @@
"Name": "SyncDmmJob",
"IntervalSeconds": 1800,
"Enabled": true
},
{
"Name": "SyncTorrentioJob",
"IntervalSeconds": 604800,
"Enabled": true
}
]
}

View File

@@ -0,0 +1,14 @@
{
"TorrentioConfiguration": {
"Instances": [
{
"Name": "Official",
"Url": "https://torrentio.strem.fun",
"RateLimit": {
"RequestLimit": 300,
"IntervalInSeconds": 3600
}
}
]
}
}
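
Since Instances is a list, pointing the producer at additional torrentio-compatible deployments should just be a matter of adding entries. A hypothetical sketch (the second instance's name, URL and limits are invented for illustration, not part of this commit):

{
"TorrentioConfiguration": {
"Instances": [
{ "Name": "Official", "Url": "https://torrentio.strem.fun", "RateLimit": { "RequestLimit": 300, "IntervalInSeconds": 3600 } },
{ "Name": "MyKnightcrawler", "Url": "https://knightcrawler.example.com", "RateLimit": { "RequestLimit": 600, "IntervalInSeconds": 3600 } }
]
}
}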

View File

@@ -11,6 +11,7 @@ public static class ConfigurationExtensions
configuration.AddJsonFile(LoggingConfig, false, true);
configuration.AddJsonFile(ScrapeConfiguration.Filename, false, true);
configuration.AddJsonFile(TorrentioConfiguration.Filename, false, true);
configuration.AddEnvironmentVariables();
@@ -18,4 +19,28 @@ public static class ConfigurationExtensions
return configuration;
}
public static TConfiguration LoadConfigurationFromConfig<TConfiguration>(this IServiceCollection services, IConfiguration configuration, string sectionName)
where TConfiguration : class
{
var instance = configuration.GetSection(sectionName).Get<TConfiguration>();
ArgumentNullException.ThrowIfNull(instance, nameof(instance));
services.TryAddSingleton(instance);
return instance;
}
public static TConfiguration LoadConfigurationFromEnv<TConfiguration>(this IServiceCollection services)
where TConfiguration : class
{
var instance = Activator.CreateInstance<TConfiguration>();
ArgumentNullException.ThrowIfNull(instance, nameof(instance));
services.TryAddSingleton(instance);
return instance;
}
}

View File

@@ -1,160 +0,0 @@
namespace Producer.Extensions;
public static class ServiceCollectionExtensions
{
internal static IServiceCollection AddCrawlers(this IServiceCollection services)
{
services.AddHttpClient("Scraper");
services
.AddKeyedTransient<ICrawler, EzTvCrawler>(nameof(EzTvCrawler))
.AddKeyedTransient<ICrawler, NyaaCrawler>(nameof(NyaaCrawler))
.AddKeyedTransient<ICrawler, YtsCrawler>(nameof(YtsCrawler))
.AddKeyedTransient<ICrawler, TpbCrawler>(nameof(TpbCrawler))
.AddKeyedTransient<ICrawler, TgxCrawler>(nameof(TgxCrawler))
.AddKeyedTransient<ICrawler, DebridMediaManagerCrawler>(nameof(DebridMediaManagerCrawler))
.AddSingleton<ICrawlerProvider, CrawlerProvider>()
.AddTransient<IIpService, IpService>();
return services;
}
internal static IServiceCollection AddDataStorage(this IServiceCollection services)
{
services.LoadConfigurationFromEnv<PostgresConfiguration>();
services.AddTransient<IDataStorage, DapperDataStorage>();
services.AddTransient<IMessagePublisher, TorrentPublisher>();
return services;
}
internal static IServiceCollection RegisterMassTransit(this IServiceCollection services)
{
var rabbitConfig = services.LoadConfigurationFromEnv<RabbitMqConfiguration>();
services.AddMassTransit(busConfigurator =>
{
busConfigurator.SetKebabCaseEndpointNameFormatter();
busConfigurator.UsingRabbitMq((_, busFactoryConfigurator) =>
{
busFactoryConfigurator.Host(rabbitConfig.Host, hostConfigurator =>
{
hostConfigurator.Username(rabbitConfig.Username);
hostConfigurator.Password(rabbitConfig.Password);
});
});
});
return services;
}
internal static IServiceCollection AddQuartz(this IServiceCollection services, IConfiguration configuration)
{
var scrapeConfiguration = services.LoadConfigurationFromConfig<ScrapeConfiguration>(configuration, ScrapeConfiguration.SectionName);
var githubConfiguration = services.LoadConfigurationFromEnv<GithubConfiguration>();
var rabbitConfig = services.LoadConfigurationFromEnv<RabbitMqConfiguration>();
services
.AddTransient<SyncEzTvJob>()
.AddTransient<SyncNyaaJob>()
.AddTransient<SyncTpbJob>()
.AddTransient<SyncYtsJob>()
.AddTransient<SyncTgxJob>()
.AddTransient<IPJob>()
.AddTransient<PublisherJob>();
if (!string.IsNullOrEmpty(githubConfiguration.PAT))
{
services.AddTransient<SyncDmmJob>();
}
services.AddQuartz(
quartz =>
{
AddJobWithTrigger<SyncEzTvJob>(quartz, SyncEzTvJob.Key, SyncEzTvJob.Trigger, scrapeConfiguration);
AddJobWithTrigger<SyncNyaaJob>(quartz, SyncNyaaJob.Key, SyncNyaaJob.Trigger, scrapeConfiguration);
AddJobWithTrigger<SyncTpbJob>(quartz, SyncTpbJob.Key, SyncTpbJob.Trigger, scrapeConfiguration);
AddJobWithTrigger<SyncYtsJob>(quartz, SyncYtsJob.Key, SyncYtsJob.Trigger, scrapeConfiguration);
AddJobWithTrigger<SyncTgxJob>(quartz, SyncTgxJob.Key, SyncTgxJob.Trigger, scrapeConfiguration);
AddJobWithTrigger<IPJob>(quartz, IPJob.Key, IPJob.Trigger, 60 * 5);
AddJobWithTrigger<PublisherJob>(quartz, PublisherJob.Key, PublisherJob.Trigger, rabbitConfig.PublishIntervalInSeconds);
if (!string.IsNullOrEmpty(githubConfiguration.PAT))
{
AddJobWithTrigger<SyncDmmJob>(quartz, SyncDmmJob.Key, SyncDmmJob.Trigger, scrapeConfiguration);
}
});
services.AddQuartzHostedService(
options =>
{
options.WaitForJobsToComplete = true;
});
return services;
}
private static TConfiguration LoadConfigurationFromConfig<TConfiguration>(this IServiceCollection services, IConfiguration configuration, string sectionName)
where TConfiguration : class
{
var instance = configuration.GetSection(sectionName).Get<TConfiguration>();
ArgumentNullException.ThrowIfNull(instance, nameof(instance));
services.TryAddSingleton(instance);
return instance;
}
private static TConfiguration LoadConfigurationFromEnv<TConfiguration>(this IServiceCollection services)
where TConfiguration : class
{
var instance = Activator.CreateInstance<TConfiguration>();
ArgumentNullException.ThrowIfNull(instance, nameof(instance));
services.TryAddSingleton(instance);
return instance;
}
private static void AddJobWithTrigger<TJobType>(
IServiceCollectionQuartzConfigurator quartz,
JobKey key,
TriggerKey trigger,
ScrapeConfiguration scrapeConfiguration) where TJobType : IJob
{
var scraper = scrapeConfiguration.Scrapers
.FirstOrDefault(x => x.Name != null &&
x.Name.Equals(typeof(TJobType).Name, StringComparison.OrdinalIgnoreCase));
if (scraper is null || !scraper.Enabled)
{
return;
}
quartz.AddJob<TJobType>(opts => opts.WithIdentity(key).StoreDurably());
quartz.AddTrigger(
opts => opts
.ForJob(key)
.WithIdentity(trigger)
.StartAt(DateTimeOffset.Now.AddSeconds(20))
.WithSimpleSchedule(x => x.WithInterval(TimeSpan.FromSeconds(scraper.IntervalSeconds)).RepeatForever()));
}
private static void AddJobWithTrigger<TJobType>(
IServiceCollectionQuartzConfigurator quartz,
JobKey key,
TriggerKey trigger,
int interval) where TJobType : IJob
{
quartz.AddJob<TJobType>(opts => opts.WithIdentity(key).StoreDurably());
quartz.AddTrigger(
opts => opts
.ForJob(key)
.WithIdentity(trigger)
.StartAt(DateTimeOffset.Now.AddSeconds(20))
.WithSimpleSchedule(x => x.WithInterval(TimeSpan.FromSeconds(interval)).RepeatForever()));
}
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Interfaces;
namespace Producer.Features.Amqp;
public interface IMessagePublisher
{

View File

@@ -1,11 +1,14 @@
namespace Producer.Jobs;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Features.Amqp;
[DisallowConcurrentExecution]
[ManualJobRegistration]
public class PublisherJob(IMessagePublisher publisher, IDataStorage storage, ILogger<PublisherJob> logger) : IJob
{
private const string JobName = nameof(PublisherJob);
public static readonly JobKey Key = new(JobName, nameof(Jobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Jobs));
public static readonly JobKey Key = new(JobName, nameof(Literals.PublishingJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.PublishingJobs));
public async Task Execute(IJobExecutionContext context)
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Models.Configuration;
namespace Producer.Features.Amqp;
public class RabbitMqConfiguration
{

View File

@@ -0,0 +1,24 @@
namespace Producer.Features.Amqp;
internal static class ServiceCollectionExtensions
{
internal static IServiceCollection RegisterMassTransit(this IServiceCollection services)
{
var rabbitConfig = services.LoadConfigurationFromEnv<RabbitMqConfiguration>();
services.AddMassTransit(busConfigurator =>
{
busConfigurator.SetKebabCaseEndpointNameFormatter();
busConfigurator.UsingRabbitMq((_, busFactoryConfigurator) =>
{
busFactoryConfigurator.Host(rabbitConfig.Host, hostConfigurator =>
{
hostConfigurator.Username(rabbitConfig.Username);
hostConfigurator.Password(rabbitConfig.Password);
});
});
});
return services;
}
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Services;
namespace Producer.Features.Amqp;
public class TorrentPublisher(
ISendEndpointProvider sendEndpointProvider,

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers;
namespace Producer.Features.CrawlerSupport;
public abstract class BaseCrawler(ILogger<BaseCrawler> logger, IDataStorage storage) : ICrawler
{

View File

@@ -1,8 +1,8 @@
namespace Producer.Crawlers;
namespace Producer.Features.CrawlerSupport;
public abstract class BaseJsonCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseJsonCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
private readonly HttpClient _client = httpClientFactory.CreateClient("Scraper");
private readonly HttpClient _client = httpClientFactory.CreateClient(Literals.CrawlerClient);
protected virtual async Task Execute(string collectionName)
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers;
namespace Producer.Features.CrawlerSupport;
public abstract class BaseXmlCrawler(IHttpClientFactory httpClientFactory, ILogger<BaseXmlCrawler> logger, IDataStorage storage) : BaseCrawler(logger, storage)
{
@@ -6,7 +6,7 @@ public abstract class BaseXmlCrawler(IHttpClientFactory httpClientFactory, ILogg
{
logger.LogInformation("Starting {Source} crawl", Source);
using var client = httpClientFactory.CreateClient("Scraper");
using var client = httpClientFactory.CreateClient(Literals.CrawlerClient);
var xml = await client.GetStringAsync(Url);
var xmlRoot = XElement.Parse(xml);

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers;
namespace Producer.Features.CrawlerSupport;
public class CrawlerProvider(IServiceProvider serviceProvider) : ICrawlerProvider
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Interfaces;
namespace Producer.Features.CrawlerSupport;
public interface ICrawler
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Interfaces;
namespace Producer.Features.CrawlerSupport;
public interface ICrawlerProvider
{

View File

@@ -0,0 +1,6 @@
namespace Producer.Features.CrawlerSupport;
public static class Literals
{
public const string CrawlerClient = "Scraper";
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Models;
namespace Producer.Features.CrawlerSupport;
public record InsertTorrentResult(bool Success, int InsertedCount = 0, string? ErrorMessage = null);
public record UpdatedTorrentResult(bool Success, int UpdatedCount = 0, string? ErrorMessage = null);

View File

@@ -1,4 +1,4 @@
namespace Producer.Models;
namespace Producer.Features.CrawlerSupport;
public class Scraper
{

View File

@@ -0,0 +1,23 @@
namespace Producer.Features.CrawlerSupport;
internal static class ServiceCollectionExtensions
{
internal static IServiceCollection AddCrawlers(this IServiceCollection services)
{
services.AddHttpClient(Literals.CrawlerClient);
var crawlerTypes = Assembly.GetAssembly(typeof(ICrawler))
.GetTypes()
.Where(t => t is {IsClass: true, IsAbstract: false} && typeof(ICrawler).IsAssignableFrom(t));
foreach (var type in crawlerTypes)
{
services.AddKeyedTransient(typeof(ICrawler), type.Name, type);
}
services
.AddSingleton<ICrawlerProvider, CrawlerProvider>();
return services;
}
}
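
Crawlers are now discovered by assembly scan and registered as keyed services under their type name, replacing the old hand-maintained AddKeyedTransient list. A minimal sketch of resolving one by key, assuming the standard .NET 8 keyed-service API (CrawlerProvider presumably does the equivalent internally):

// Resolve a specific crawler via its registration key (the CLR type name).
var crawler = serviceProvider.GetRequiredKeyedService<ICrawler>(nameof(EzTvCrawler));
await crawler.Execute(); // assumes ICrawler exposes the Execute() seen on the crawler implementations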

View File

@@ -1,4 +1,4 @@
namespace Producer.Models;
namespace Producer.Features.CrawlerSupport;
// Torrent represents a crawled torrent from one of our
// supported sources.

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.Dmm;
public partial class DebridMediaManagerCrawler(
IHttpClientFactory httpClientFactory,

View File

@@ -1,4 +1,4 @@
namespace Producer.Models.Configuration;
namespace Producer.Features.Crawlers.Dmm;
public class GithubConfiguration
{

View File

@@ -1,12 +1,13 @@
using DebridMediaManagerCrawler = Producer.Crawlers.Sites.DebridMediaManagerCrawler;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.Dmm;
[DisallowConcurrentExecution]
[ManualJobRegistration]
public class SyncDmmJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(DebridMediaManagerCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(DebridMediaManagerCrawler);
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.EzTv;
public class EzTvCrawler(IHttpClientFactory httpClientFactory, ILogger<EzTvCrawler> logger, IDataStorage storage) : BaseXmlCrawler(httpClientFactory, logger, storage)
{

View File

@@ -1,12 +1,12 @@
using Producer.Crawlers.Sites;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.EzTv;
[DisallowConcurrentExecution]
public class SyncEzTvJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(EzTvCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(EzTvCrawler);
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.Nyaa;
public class NyaaCrawler(IHttpClientFactory httpClientFactory, ILogger<NyaaCrawler> logger, IDataStorage storage) : BaseXmlCrawler(httpClientFactory, logger, storage)
{

View File

@@ -1,12 +1,12 @@
using Producer.Crawlers.Sites;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.Nyaa;
[DisallowConcurrentExecution]
public class SyncNyaaJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(NyaaCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(NyaaCrawler);
}

View File

@@ -1,12 +1,12 @@
using TgxCrawler = Producer.Crawlers.Sites.TgxCrawler;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.Tgx;
[DisallowConcurrentExecution]
public class SyncTgxJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(TgxCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(TgxCrawler);
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.Tgx;
public partial class TgxCrawler(IHttpClientFactory httpClientFactory, ILogger<TgxCrawler> logger, IDataStorage storage) : BaseXmlCrawler(httpClientFactory, logger, storage)
{

View File

@@ -0,0 +1,13 @@
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Features.Crawlers.Torrentio;
[DisallowConcurrentExecution]
[ManualJobRegistration]
public class SyncTorrentioJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(TorrentioCrawler);
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(TorrentioCrawler);
}

View File

@@ -0,0 +1,9 @@
namespace Producer.Features.Crawlers.Torrentio;
public class TorrentioConfiguration
{
public const string SectionName = "TorrentioConfiguration";
public const string Filename = "torrentio.json";
public List<TorrentioInstance> Instances { get; set; } = [];
}

View File

@@ -0,0 +1,197 @@
using Literals = Producer.Features.CrawlerSupport.Literals;
namespace Producer.Features.Crawlers.Torrentio;
public partial class TorrentioCrawler(
IHttpClientFactory httpClientFactory,
ILogger<TorrentioCrawler> logger,
IDataStorage storage,
TorrentioConfiguration configuration,
ImdbMongoDbService imdbDataService) : BaseCrawler(logger, storage)
{
[GeneratedRegex(@"(\d+(\.\d+)?) (GB|MB)")]
private static partial Regex SizeMatcher();
private const string MovieSlug = "movie/{0}.json";
protected override string Url => "sort=size%7Cqualityfilter=other,scr,cam,unknown/stream/{0}";
protected override IReadOnlyDictionary<string, string> Mappings { get; } = new Dictionary<string, string>();
protected override string Source => "Torrentio";
private readonly Dictionary<string, TorrentioScrapeInstance> _instanceStates = [];
public override async Task Execute()
{
var client = httpClientFactory.CreateClient(Literals.CrawlerClient);
var instances = configuration.Instances;
var totalRecordCount = await imdbDataService.GetTotalCountAsync();
logger.LogInformation("Total IMDB records to process: {TotalRecordCount}", totalRecordCount);
var tasks = instances.Select(x => ProcessForInstanceAsync(x, client, totalRecordCount)).ToArray();
await Task.WhenAll(tasks);
}
private Task ProcessForInstanceAsync(TorrentioInstance instance, HttpClient client, long totalRecordCount) =>
Task.Run(
async () =>
{
while (instance.TotalProcessedRequests(_instanceStates) < totalRecordCount)
{
logger.LogInformation("Processing {TorrentioInstance}", instance.Name);
logger.LogInformation("Current processed requests: {ProcessedRequests}", instance.TotalProcessedRequests(_instanceStates));
var items = await imdbDataService.GetImdbEntriesForRequests(
DateTime.UtcNow.Year.ToString(),
instance.RateLimit.RequestLimit,
instance.LastProcessedImdbId(_instanceStates));
if (items.Count == 0)
{
logger.LogInformation("No items to process for {TorrentioInstance}", instance.Name);
break;
}
var newTorrents = new List<Torrent>();
var processedItemsCount = 0;
foreach (var item in items)
{
processedItemsCount++;
var waitTime = instance.CalculateWaitTime(_instanceStates);
if (waitTime > TimeSpan.Zero)
{
logger.LogInformation("Rate limit reached for {TorrentioInstance}", instance.Name);
logger.LogInformation("Waiting for {TorrentioInstance}: {WaitTime}", instance.Name, waitTime);
await Task.Delay(waitTime);
}
if (processedItemsCount % 2 == 0)
{
var randomWait = new Random().Next(1000, 5000);
logger.LogInformation("Waiting for {TorrentioInstance}: {WaitTime}", instance.Name, randomWait);
await Task.Delay(randomWait);
}
try
{
var torrentInfo = await ScrapeInstance(instance, item.ImdbId, client);
if (torrentInfo is not null)
{
newTorrents.AddRange(torrentInfo.Where(x => x != null).Select(x => x!));
}
}
catch (Exception error)
{
logger.LogError(error, "Page processing error in TorrentioCrawler");
}
}
if (newTorrents.Count > 0)
{
await InsertTorrents(newTorrents);
var currentState = _instanceStates[instance.Name];
_instanceStates[instance.Name] = currentState with
{
LastProcessedImdbId = items[^1].ImdbId,
};
}
}
});
private async Task<List<Torrent?>?> ScrapeInstance(TorrentioInstance instance, string imdbId, HttpClient client)
{
logger.LogInformation("Searching Torrentio {TorrentioInstance}: {ImdbId}", instance.Name, imdbId);
try
{
var movieSlug = string.Format(MovieSlug, imdbId);
var urlSlug = string.Format(Url, movieSlug);
return await RunRequest(instance, urlSlug, imdbId, client);
}
catch (Exception error)
{
logger.LogError(error, "Page processing error {TorrentioInstance}: {ImdbId}", instance.Name, imdbId);
logger.LogWarning("Setting {TorrentioInstance} to possibly rate limited for five minutes", instance.Name);
instance.SetPossiblyRateLimited(_instanceStates);
}
return null;
}
private async Task<List<Torrent?>?> RunRequest(TorrentioInstance instance, string urlSlug, string imdbId, HttpClient client)
{
var requestUrl = $"{instance.Url}/{urlSlug}";
var response = await client.GetAsync(requestUrl);
if (!response.IsSuccessStatusCode)
{
logger.LogError("Failed to fetch {Url}", requestUrl);
return null;
}
var json = JsonDocument.Parse(await response.Content.ReadAsStringAsync());
var streams = json.RootElement.GetProperty("streams").EnumerateArray();
return streams.Select(x => ParseTorrent(instance, x, imdbId)).Where(x => x != null).ToList();
}
private Torrent? ParseTorrent(TorrentioInstance instance, JsonElement item, string imdbId)
{
var title = item.GetProperty("title").GetString();
var infoHash = item.GetProperty("infoHash").GetString();
if (string.IsNullOrEmpty(title) || string.IsNullOrEmpty(infoHash))
{
return null;
}
var torrent = ParseTorrentDetails(title, instance, infoHash, imdbId);
if (torrent is null || string.IsNullOrEmpty(torrent.Name))
{
return null;
}
return torrent;
}
private Torrent? ParseTorrentDetails(string title, TorrentioInstance instance, string infoHash, string imdbId)
{
try
{
var torrent = new Torrent
{
Source = $"{Source}_{instance.Name}",
InfoHash = infoHash,
Category = "movies", // we only handle movies for now...
Imdb = imdbId,
};
var span = title.AsSpan();
var titleEnd = span.IndexOf('\n');
var titlePart = titleEnd >= 0 ? span[..titleEnd].ToString() : title;
torrent.Name = titlePart.Replace('.', ' ').Trim();
var sizeMatch = SizeMatcher().Match(title);
if (sizeMatch.Success)
{
var size = double.Parse(sizeMatch.Groups[1].Value); // Size Value
var sizeUnit = sizeMatch.Groups[3].Value; // Size Unit (GB/MB)
var sizeInBytes = sizeUnit switch
{
"GB" => (long) (size * 1073741824),
"MB" => (long) (size * 1048576),
_ => 0,
};
torrent.Size = sizeInBytes.ToString();
}
return torrent;
}
catch (Exception e)
{
logger.LogError(e, "Error parsing torrent details");
return null;
}
}
}
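
ParseTorrent only needs title and infoHash from each stream element; the size is regex-matched out of the human-readable title text. An illustrative payload shape, inferred from the parsing code rather than captured from a live instance:

{
"streams": [
{
"title": "Some.Movie.2023.1080p.WEB-DL\n💾 2.15 GB",
"infoHash": "0123456789abcdef0123456789abcdef01234567"
}
]
}

Here the first line of title becomes the torrent Name (dots replaced with spaces) and SizeMatcher() picks up "2.15 GB".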

View File

@@ -0,0 +1,10 @@
namespace Producer.Features.Crawlers.Torrentio;
public class TorrentioInstance
{
public string Name { get; init; } = default!;
public string Url { get; init; } = default!;
public TorrentioRateLimit RateLimit { get; init; } = default!;
}

View File

@@ -0,0 +1,53 @@
namespace Producer.Features.Crawlers.Torrentio;
public static class TorrentioInstancesExtensions
{
public static TimeSpan CalculateWaitTime(this TorrentioInstance instance, Dictionary<string, TorrentioScrapeInstance> scraperState)
{
if (!scraperState.TryGetValue(instance.Name, out var state))
{
state = new (DateTime.UtcNow, 0, 0, null);
scraperState[instance.Name] = state;
}
var (startedAt, requestCount, totalProcessed, lastProcessedImdbId) = state;
if (requestCount < instance.RateLimit.RequestLimit)
{
scraperState[instance.Name] = new (startedAt, requestCount + 1, totalProcessed + 1, lastProcessedImdbId);
return TimeSpan.Zero;
}
var elapsed = DateTime.UtcNow - startedAt;
var interval = TimeSpan.FromSeconds(instance.RateLimit.IntervalInSeconds);
var remaining = interval - elapsed;
// reset the state for the next interval
scraperState[instance.Name] = new (DateTime.UtcNow, 0, totalProcessed, lastProcessedImdbId);
return remaining > TimeSpan.Zero ? remaining : TimeSpan.Zero;
}
public static void SetPossiblyRateLimited(this TorrentioInstance instance, Dictionary<string, TorrentioScrapeInstance> scraperState, int minutesToWait = 5)
{
if (!scraperState.TryGetValue(instance.Name, out var state))
{
state = new (DateTime.UtcNow, 0, 0, null);
}
var (_, _, totalProcessed, lastProcessedImdbId) = state;
// Backdate the window start so the next CalculateWaitTime call leaves roughly minutesToWait of the interval remaining
var startedAt = DateTime.UtcNow.AddSeconds(minutesToWait * 60 - instance.RateLimit.IntervalInSeconds);
var requestCount = instance.RateLimit.RequestLimit;
// Update the scraper state for the instance
scraperState[instance.Name] = new (startedAt, requestCount, totalProcessed, lastProcessedImdbId);
}
public static long TotalProcessedRequests(this TorrentioInstance instance, Dictionary<string, TorrentioScrapeInstance> scraperState) =>
!scraperState.TryGetValue(instance.Name, out var state) ? 0 : state.TotalProcessed;
public static string? LastProcessedImdbId(this TorrentioInstance instance, Dictionary<string, TorrentioScrapeInstance> scraperState) =>
!scraperState.TryGetValue(instance.Name, out var state) ? null : state.LastProcessedImdbId;
}
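
The window math is easiest to see with the shipped numbers (300 requests per 3600 seconds). A sketch of how the shared state behaves, with hypothetical driver code rather than anything in this commit:

var instance = new TorrentioInstance
{
    Name = "Official",
    Url = "https://torrentio.strem.fun",
    RateLimit = new TorrentioRateLimit { RequestLimit = 300, IntervalInSeconds = 3600 },
};
var state = new Dictionary<string, TorrentioScrapeInstance>();
for (var i = 0; i < 300; i++)
{
    _ = instance.CalculateWaitTime(state); // TimeSpan.Zero while under the request limit
}
var wait = instance.CalculateWaitTime(state); // remainder of the 3600-second window, then the counter resets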

View File

@@ -0,0 +1,7 @@
namespace Producer.Features.Crawlers.Torrentio;
public class TorrentioRateLimit
{
public int RequestLimit { get; set; }
public int IntervalInSeconds { get; set; }
}

View File

@@ -0,0 +1,3 @@
namespace Producer.Features.Crawlers.Torrentio;
public record TorrentioScrapeInstance(DateTime StartedAt, int RequestCount, int TotalProcessed, string? LastProcessedImdbId);

View File

@@ -1,12 +1,12 @@
using Producer.Crawlers.Sites;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.Tpb;
[DisallowConcurrentExecution]
public class SyncTpbJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(TpbCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(TpbCrawler);
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.Tpb;
public class TpbCrawler(IHttpClientFactory httpClientFactory, ILogger<TpbCrawler> logger, IDataStorage storage) : BaseJsonCrawler(httpClientFactory, logger, storage)
{

View File

@@ -1,12 +1,12 @@
using Producer.Crawlers.Sites;
using Literals = Producer.Features.JobSupport.Literals;
namespace Producer.Jobs;
namespace Producer.Features.Crawlers.Yts;
[DisallowConcurrentExecution]
public class SyncYtsJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
private const string JobName = nameof(YtsCrawler);
public static readonly JobKey Key = new(JobName, nameof(Crawlers));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Crawlers));
public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
protected override string Crawler => nameof(YtsCrawler);
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Crawlers.Sites;
namespace Producer.Features.Crawlers.Yts;
public class YtsCrawler(IHttpClientFactory httpClientFactory, ILogger<YtsCrawler> logger, IDataStorage storage) : BaseXmlCrawler(httpClientFactory, logger, storage)
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Services;
namespace Producer.Features.DataProcessing;
public class DapperDataStorage(PostgresConfiguration configuration, RabbitMqConfiguration rabbitConfig, ILogger<DapperDataStorage> logger) : IDataStorage
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Interfaces;
namespace Producer.Features.DataProcessing;
public interface IDataStorage
{

View File

@@ -0,0 +1,15 @@
namespace Producer.Features.DataProcessing;
public class ImdbEntry
{
[BsonId]
public string ImdbId { get; set; } = default!;
public string? TitleType { get; set; }
public string? PrimaryTitle { get; set; }
public string? OriginalTitle { get; set; }
public string? IsAdult { get; set; }
public string? StartYear { get; set; }
public string? EndYear { get; set; }
public string? RuntimeMinutes { get; set; }
public string? Genres { get; set; }
}

View File

@@ -0,0 +1,78 @@
namespace Producer.Features.DataProcessing;
public class ImdbMongoDbService
{
private readonly ILogger<ImdbMongoDbService> _logger;
private readonly IMongoCollection<ImdbEntry> _imdbCollection;
public ImdbMongoDbService(MongoConfiguration configuration, ILogger<ImdbMongoDbService> logger)
{
_logger = logger;
var client = new MongoClient(configuration.ConnectionString);
var database = client.GetDatabase(configuration.DbName);
_imdbCollection = database.GetCollection<ImdbEntry>("imdb-entries");
}
public async Task<IReadOnlyList<ImdbEntry>> GetImdbEntriesForRequests(string startYear, int requestLimit, string? startingId = null)
{
var sort = Builders<ImdbEntry>.Sort
.Descending(e => e.StartYear)
.Descending(e => e.ImdbId);
var filter = Builders<ImdbEntry>.Filter
.And(
Builders<ImdbEntry>.Filter.Eq(e => e.TitleType, "movie"),
Builders<ImdbEntry>.Filter.Lte(e => e.StartYear, startYear)
);
if (!string.IsNullOrWhiteSpace(startingId))
{
filter = Builders<ImdbEntry>.Filter.And(filter, Builders<ImdbEntry>.Filter.Lt(e => e.ImdbId, startingId));
}
return await _imdbCollection.Find(filter).Limit(requestLimit).Sort(sort).ToListAsync();
}
public async Task<long> GetTotalCountAsync()
{
var filter = Builders<ImdbEntry>.Filter.Eq(x => x.TitleType, "movie");
return await _imdbCollection.CountDocumentsAsync(filter);
}
public bool IsDatabaseInitialized()
{
try
{
// Compound index for PrimaryTitle, TitleType, and StartYear
var index1KeysDefinition = Builders<ImdbEntry>.IndexKeys
.Text(e => e.PrimaryTitle)
.Ascending(e => e.TitleType)
.Ascending(e => e.StartYear);
CreateIndex(index1KeysDefinition);
// Compound index for StartYear and _id in descending order
var index2KeysDefinition = Builders<ImdbEntry>.IndexKeys
.Descending(e => e.StartYear)
.Descending(e => e.ImdbId);
CreateIndex(index2KeysDefinition);
return true;
}
catch (Exception e)
{
_logger.LogError(e, "Error initializing database");
return false;
}
}
private void CreateIndex(IndexKeysDefinition<ImdbEntry> keysDefinition)
{
var createIndexOptions = new CreateIndexOptions { Background = true };
var indexModel = new CreateIndexModel<ImdbEntry>(keysDefinition, createIndexOptions);
_imdbCollection.Indexes.CreateOne(indexModel);
}
}
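
GetImdbEntriesForRequests pages with a keyset cursor instead of Skip: callers hand the last ImdbId of a batch back in as startingId, and the Lt filter continues strictly below it. Roughly how TorrentioCrawler walks the collection (simplified sketch, inside an async method):

string? cursor = null;
while (true)
{
    var page = await imdbDataService.GetImdbEntriesForRequests(DateTime.UtcNow.Year.ToString(), 300, cursor);
    if (page.Count == 0)
    {
        break;
    }
    cursor = page[^1].ImdbId; // next batch resumes below the last id seen
}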

View File

@@ -0,0 +1,20 @@
namespace Producer.Features.DataProcessing;
public class MongoConfiguration
{
private const string Prefix = "MONGODB";
private const string HostVariable = "HOST";
private const string PortVariable = "PORT";
private const string DbVariable = "DB";
private const string UsernameVariable = "USER";
private const string PasswordVariable = "PASSWORD";
private string Host { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(HostVariable);
private int Port { get; init; } = Prefix.GetEnvironmentVariableAsInt(PortVariable, 27017);
private string Username { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(UsernameVariable);
private string Password { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(PasswordVariable);
public string DbName { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(DbVariable);
public string ConnectionString => $"mongodb://{Username}:{Password}@{Host}:{Port}/{DbName}?tls=false&directConnection=true&authSource=admin";
}
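
Assuming the GetRequiredEnvironmentVariableAsString/GetEnvironmentVariableAsInt helpers join the prefix and variable name with an underscore (the helpers themselves are not shown in this diff), the producer would expect environment variables along these lines (all values illustrative):

MONGODB_HOST=mongodb
MONGODB_PORT=27017
MONGODB_DB=knightcrawler
MONGODB_USER=mongo
MONGODB_PASSWORD=changeme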

View File

@@ -1,4 +1,4 @@
namespace Producer.Models.Configuration;
namespace Producer.Features.DataProcessing;
public class PostgresConfiguration
{

View File

@@ -0,0 +1,14 @@
namespace Producer.Features.DataProcessing;
internal static class ServiceCollectionExtensions
{
internal static IServiceCollection AddDataStorage(this IServiceCollection services)
{
services.LoadConfigurationFromEnv<PostgresConfiguration>();
services.LoadConfigurationFromEnv<MongoConfiguration>();
services.AddTransient<IDataStorage, DapperDataStorage>();
services.AddTransient<IMessagePublisher, TorrentPublisher>();
services.AddSingleton<ImdbMongoDbService>();
return services;
}
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Jobs;
namespace Producer.Features.JobSupport;
public abstract class BaseJob(ICrawlerProvider crawlerProvider) : IJob
{

View File

@@ -1,4 +1,4 @@
namespace Producer.Jobs;
namespace Producer.Features.JobSupport;
public interface ICrawlerJob<out TCrawler> : IJob
where TCrawler : ICrawler

View File

@@ -0,0 +1,7 @@
namespace Producer.Features.JobSupport;
public static class Literals
{
public const string CrawlersJobs = "CrawlersJobs";
public const string PublishingJobs = "PublishingJobs";
}

View File

@@ -0,0 +1,6 @@
namespace Producer.Features.JobSupport;
[AttributeUsage(AttributeTargets.Class)]
public class ManualJobRegistrationAttribute : Attribute
{
}

View File

@@ -1,4 +1,4 @@
namespace Producer.Models.Configuration;
namespace Producer.Features.JobSupport;
public class ScrapeConfiguration
{

View File

@@ -0,0 +1,131 @@
namespace Producer.Features.JobSupport;
internal static class ServiceCollectionExtensions
{
internal static IServiceCollection AddQuartz(this IServiceCollection services, IConfiguration configuration)
{
var scrapeConfiguration = services.LoadConfigurationFromConfig<ScrapeConfiguration>(configuration, ScrapeConfiguration.SectionName);
var githubConfiguration = services.LoadConfigurationFromEnv<GithubConfiguration>();
var rabbitConfiguration = services.LoadConfigurationFromEnv<RabbitMqConfiguration>();
var jobTypes = Assembly.GetAssembly(typeof(BaseJob))
.GetTypes()
.Where(t => t is {IsClass: true, IsAbstract: false} && typeof(IJob).IsAssignableFrom(t) &&
!Attribute.IsDefined(t, typeof(ManualJobRegistrationAttribute)))
.ToList();
foreach (var type in jobTypes)
{
services.AddTransient(type);
}
if (!string.IsNullOrEmpty(githubConfiguration.PAT))
{
services.AddTransient<SyncDmmJob>();
}
var openMethod = typeof(ServiceCollectionExtensions).GetMethod(nameof(AddJobWithTrigger), BindingFlags.NonPublic | BindingFlags.Static | BindingFlags.Instance);
services.AddQuartz(
quartz =>
{
RegisterAutomaticRegistrationJobs(jobTypes, openMethod, quartz, scrapeConfiguration);
RegisterDmmJob(githubConfiguration, quartz, scrapeConfiguration);
RegisterTorrentioJob(services, quartz, configuration, scrapeConfiguration);
RegisterPublisher(quartz, rabbitConfiguration);
});
services.AddQuartzHostedService(
options =>
{
options.WaitForJobsToComplete = true;
});
return services;
}
private static void RegisterAutomaticRegistrationJobs(List<Type> jobTypes, MethodInfo? openMethod, IServiceCollectionQuartzConfigurator quartz,
ScrapeConfiguration scrapeConfiguration)
{
foreach (var jobType in jobTypes)
{
var key = jobType.GetField("Key")?.GetValue(null);
var trigger = jobType.GetField("Trigger")?.GetValue(null);
if (key is null || trigger is null)
{
Console.WriteLine($"Job {jobType.Name} does not have a static JobKey or TriggerKey field");
continue;
}
var method = openMethod.MakeGenericMethod(jobType);
method.Invoke(null, [quartz, key, trigger, scrapeConfiguration]);
}
}
private static void RegisterDmmJob(GithubConfiguration githubConfiguration, IServiceCollectionQuartzConfigurator quartz, ScrapeConfiguration scrapeConfiguration)
{
if (!string.IsNullOrEmpty(githubConfiguration.PAT))
{
AddJobWithTrigger<SyncDmmJob>(quartz, SyncDmmJob.Key, SyncDmmJob.Trigger, scrapeConfiguration);
}
}
private static void RegisterTorrentioJob(
IServiceCollection services,
IServiceCollectionQuartzConfigurator quartz,
IConfiguration configuration,
ScrapeConfiguration scrapeConfiguration)
{
var torrentioConfiguration = services.LoadConfigurationFromConfig<TorrentioConfiguration>(configuration, TorrentioConfiguration.SectionName);
if (torrentioConfiguration.Instances.Count != 0)
{
AddJobWithTrigger<SyncTorrentioJob>(quartz, SyncTorrentioJob.Key, SyncTorrentioJob.Trigger, scrapeConfiguration);
}
}
private static void RegisterPublisher(IServiceCollectionQuartzConfigurator quartz, RabbitMqConfiguration rabbitConfig) =>
AddJobWithTriggerAndInterval<PublisherJob>(quartz, PublisherJob.Key, PublisherJob.Trigger, rabbitConfig.PublishIntervalInSeconds);
private static void AddJobWithTrigger<TJobType>(
IServiceCollectionQuartzConfigurator quartz,
JobKey key,
TriggerKey trigger,
ScrapeConfiguration scrapeConfiguration) where TJobType : IJob
{
var scraper = scrapeConfiguration.Scrapers
.FirstOrDefault(x => x.Name != null &&
x.Name.Equals(typeof(TJobType).Name, StringComparison.OrdinalIgnoreCase));
if (scraper is null || !scraper.Enabled)
{
return;
}
quartz.AddJob<TJobType>(opts => opts.WithIdentity(key).StoreDurably());
quartz.AddTrigger(
opts => opts
.ForJob(key)
.WithIdentity(trigger)
.StartAt(DateTimeOffset.Now.AddSeconds(20))
.WithSimpleSchedule(x => x.WithInterval(TimeSpan.FromSeconds(scraper.IntervalSeconds)).RepeatForever()));
}
private static void AddJobWithTriggerAndInterval<TJobType>(
IServiceCollectionQuartzConfigurator quartz,
JobKey key,
TriggerKey trigger,
int interval) where TJobType : IJob
{
quartz.AddJob<TJobType>(opts => opts.WithIdentity(key).StoreDurably());
quartz.AddTrigger(
opts => opts
.ForJob(key)
.WithIdentity(trigger)
.StartAt(DateTimeOffset.Now.AddSeconds(20))
.WithSimpleSchedule(x => x.WithInterval(TimeSpan.FromSeconds(interval)).RepeatForever()));
}
}
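
With this in place, any IJob in the assembly that is not marked [ManualJobRegistration] is registered automatically, as long as it exposes the static Key/Trigger fields the reflection reads and has an enabled scrapers.json entry whose Name matches the job type name (here SyncExampleJob). A hypothetical new job needs nothing beyond the existing pattern (ExampleCrawler is invented for illustration):

[DisallowConcurrentExecution]
public class SyncExampleJob(ICrawlerProvider crawlerProvider) : BaseJob(crawlerProvider)
{
    private const string JobName = nameof(ExampleCrawler); // hypothetical crawler type
    public static readonly JobKey Key = new(JobName, nameof(Literals.CrawlersJobs));
    public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Literals.CrawlersJobs));
    protected override string Crawler => nameof(ExampleCrawler);
}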

View File

@@ -1,9 +1,10 @@
// Global using directives
global using System.Reflection;
global using System.Text;
global using System.Text.Json;
global using System.Text.Json.Serialization;
global using System.Text.RegularExpressions;
global using System.Threading.Channels;
global using System.Xml.Linq;
global using Dapper;
global using LZStringCSharp;
@@ -12,14 +13,15 @@ global using Microsoft.AspNetCore.Builder;
global using Microsoft.Extensions.DependencyInjection;
global using Microsoft.Extensions.DependencyInjection.Extensions;
global using Microsoft.Extensions.Logging;
global using MongoDB.Bson.Serialization.Attributes;
global using MongoDB.Driver;
global using Npgsql;
global using Quartz;
global using Producer.Crawlers;
global using Producer.Crawlers.Sites;
global using Producer.Extensions;
global using Producer.Interfaces;
global using Producer.Jobs;
global using Producer.Models;
global using Producer.Models.Configuration;
global using Producer.Services;
global using Producer.Features.Amqp;
global using Producer.Features.Crawlers.Dmm;
global using Producer.Features.Crawlers.Torrentio;
global using Producer.Features.CrawlerSupport;
global using Producer.Features.DataProcessing;
global using Producer.Features.JobSupport;
global using Serilog;

View File

@@ -1,6 +0,0 @@
namespace Producer.Interfaces;
public interface IIpService
{
Task GetPublicIpAddress();
}

View File

@@ -1,14 +0,0 @@
namespace Producer.Jobs;
[DisallowConcurrentExecution]
public class IPJob(IIpService ipService) : IJob
{
private const string JobName = nameof(IPJob);
public static readonly JobKey Key = new(JobName, nameof(Jobs));
public static readonly TriggerKey Trigger = new($"{JobName}-trigger", nameof(Jobs));
public Task Execute(IJobExecutionContext context)
{
return ipService.GetPublicIpAddress();
}
}

View File

@@ -15,6 +15,7 @@
<PackageReference Include="MassTransit.RabbitMQ" Version="8.1.3" />
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
<PackageReference Include="MongoDB.Driver" Version="2.24.0" />
<PackageReference Include="Npgsql" Version="8.0.1" />
<PackageReference Include="Quartz.Extensions.DependencyInjection" Version="3.8.0" />
<PackageReference Include="Quartz.Extensions.Hosting" Version="3.8.0" />
@@ -24,12 +25,8 @@
</ItemGroup>
<ItemGroup>
<Content Remove="Configuration\scrapers.json" />
<None Include="Configuration\scrapers.json">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<Content Remove="Configuration\logging.json" />
<None Include="Configuration\logging.json">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<Content Remove="Configuration\*.json" />
<None Include="Configuration\*.json">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>

View File

@@ -1,13 +0,0 @@
namespace Producer.Services;
public class IpService(ILogger<IpService> logger, IHttpClientFactory httpClientFactory) : IIpService
{
public async Task GetPublicIpAddress()
{
var client = httpClientFactory.CreateClient("Scraper");
client.DefaultRequestHeaders.UserAgent.ParseAdd("curl");
var request = await client.GetStringAsync("http://ifconfig.me");
logger.LogInformation("Public IP Address: {PublicIPAddress}", request);
}
}