Improve producer matching - Add tissue service
Tissue service will sanitize the existing database of ingested torrents by matching existing titles against the new banned word lists. Now with added kleenex
This commit is contained in:
14
.github/workflows/build_tissue.yaml
vendored
Normal file
14
.github/workflows/build_tissue.yaml
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
name: Build and Push Tissue Service
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'src/tissue/**'
|
||||
|
||||
jobs:
|
||||
process:
|
||||
uses: ./.github/workflows/base_image_workflow.yaml
|
||||
secrets: inherit
|
||||
with:
|
||||
CONTEXT: ./src/tissue/
|
||||
IMAGE_NAME: knightcrawler-tissue
|
||||
@@ -17,5 +17,5 @@ repos:
|
||||
hooks:
|
||||
- id: codespell
|
||||
exclude: |
|
||||
(?x)^(src/node/consumer/test/.*|src/producer/Data/.*)$
|
||||
(?x)^(src/node/consumer/test/.*|src/producer/Data/.*|src/tissue/Data/.*)$
|
||||
args: ["-L", "strem,chage"]
|
||||
|
||||
@@ -35,7 +35,6 @@ adultprime
|
||||
adulttime
|
||||
adulttimepilots
|
||||
aebn
|
||||
anal
|
||||
agentredgirl
|
||||
alettaoceanempire
|
||||
alexismonroe
|
||||
@@ -123,7 +122,6 @@ aussieass
|
||||
aussiefellatioqueens
|
||||
aussiepov
|
||||
austinwilde
|
||||
av69
|
||||
avadawn
|
||||
avanal
|
||||
aventertainments
|
||||
@@ -715,7 +713,6 @@ hunterpov
|
||||
hushpass
|
||||
hussiepass
|
||||
hustlaz
|
||||
hustler
|
||||
hustlerhd
|
||||
hustlerparodies
|
||||
hustlerslesbians
|
||||
@@ -1670,8 +1667,6 @@ tube8
|
||||
x-art
|
||||
mommygotboobs
|
||||
threesomes
|
||||
dp
|
||||
onlyfans
|
||||
alterotic
|
||||
assy
|
||||
defloration
|
||||
|
||||
@@ -1,7 +1,24 @@
|
||||
namespace Producer.Extensions;
|
||||
|
||||
public static class StringExtensions
|
||||
public static partial class StringExtensions
|
||||
{
|
||||
[GeneratedRegex("[^a-zA-Z0-9 ]")]
|
||||
private static partial Regex NotAlphaNumeric();
|
||||
|
||||
private static readonly char[] separator = [' '];
|
||||
|
||||
public static bool IsNullOrEmpty(this string? value) =>
|
||||
string.IsNullOrEmpty(value);
|
||||
|
||||
public static string NormalizeTitle(this string title)
|
||||
{
|
||||
var alphanumericTitle = NotAlphaNumeric().Replace(title, " ");
|
||||
|
||||
var words = alphanumericTitle.Split(separator, StringSplitOptions.RemoveEmptyEntries)
|
||||
.Select(word => word.ToLower());
|
||||
|
||||
var normalizedTitle = string.Join(" ", words);
|
||||
|
||||
return normalizedTitle;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,21 @@
|
||||
namespace Producer.Features.ParseTorrentTitle;
|
||||
|
||||
public partial class ParsingService(IWordCollections wordCollections, ITorrentTitleParser torrentTitleParser) : IParsingService
|
||||
public partial class ParsingService : IParsingService
|
||||
{
|
||||
private readonly IWordCollections _wordCollections;
|
||||
private readonly ITorrentTitleParser _torrentTitleParser;
|
||||
private static readonly char[] WhitespaceSeparator = [' '];
|
||||
private HashSet<string> _compoundWords = [];
|
||||
|
||||
public ParsingService(IWordCollections wordCollections, ITorrentTitleParser torrentTitleParser)
|
||||
{
|
||||
_wordCollections = wordCollections;
|
||||
_torrentTitleParser = torrentTitleParser;
|
||||
|
||||
_compoundWords.UnionWith(_wordCollections.AdultCompoundPhrases);
|
||||
_compoundWords.UnionWith(_wordCollections.Jav);
|
||||
_compoundWords.UnionWith(_wordCollections.AdultStars);
|
||||
}
|
||||
|
||||
public string Naked(string title) =>
|
||||
NakedMatcher().Replace(title.ToLower(), "");
|
||||
@@ -198,8 +211,8 @@ public partial class ParsingService(IWordCollections wordCollections, ITorrentTi
|
||||
|
||||
public bool FlexEq(string test, string target, List<string> years)
|
||||
{
|
||||
var movieTitle = torrentTitleParser.Parse(test).Movie.Title.ToLower();
|
||||
var tvTitle = torrentTitleParser.Parse(test).Show.Title.ToLower();
|
||||
var movieTitle = _torrentTitleParser.Parse(test).Movie.Title.ToLower();
|
||||
var tvTitle = _torrentTitleParser.Parse(test).Show.Title.ToLower();
|
||||
|
||||
var target2 = WhitespaceMatcher().Replace(target, "");
|
||||
var test2 = WhitespaceMatcher().Replace(test, "");
|
||||
@@ -247,7 +260,7 @@ public partial class ParsingService(IWordCollections wordCollections, ITorrentTi
|
||||
return false;
|
||||
}
|
||||
|
||||
var keyTerms = splits.Where(s => (s.Length > 1 && !wordCollections.CommonWords.Contains(s)) || s.Length > 5).ToList();
|
||||
var keyTerms = splits.Where(s => (s.Length > 1 && !_wordCollections.CommonWords.Contains(s)) || s.Length > 5).ToList();
|
||||
keyTerms.AddRange(target.Split(WhitespaceSeparator, StringSplitOptions.RemoveEmptyEntries).Where(e => e.Length > 2));
|
||||
var keySet = new HashSet<string>(keyTerms);
|
||||
var commonTerms = splits.Where(s => !keySet.Contains(s)).ToList();
|
||||
@@ -296,42 +309,30 @@ public partial class ParsingService(IWordCollections wordCollections, ITorrentTi
|
||||
|
||||
public bool HasNoBannedTerms(string targetTitle, string testTitle)
|
||||
{
|
||||
var words = WordMatcher().Split(testTitle.ToLower()).Where(word => word.Length > 3).ToList();
|
||||
var normalisedTitle = targetTitle.NormalizeTitle();
|
||||
|
||||
var hasBannedWords = words.Any(word => !targetTitle.Contains(word) && wordCollections.AdultWords.Contains(word));
|
||||
var normalisedWords = normalisedTitle.Split(' ');
|
||||
|
||||
var titleWithoutSymbols = string.Join(' ', WordMatcher().Split(testTitle.ToLower()));
|
||||
var hasBannedWords = normalisedWords.Where(word => word.Length >= 3).Any(word => !targetTitle.Contains(word) && _wordCollections.AdultWords.Contains(word));
|
||||
|
||||
var hasJavWords = wordCollections.Jav.Any(jav => !targetTitle.Contains(jav) && titleWithoutSymbols.Contains(jav));
|
||||
|
||||
var hasAdultStars = wordCollections.AdultStars.Any(star => !targetTitle.Contains(star) && titleWithoutSymbols.Contains(star));
|
||||
|
||||
var hasBannedCompoundWords = wordCollections.AdultCompoundPhrases.Any(compoundWord => !targetTitle.Contains(compoundWord) && titleWithoutSymbols.Contains(compoundWord));
|
||||
var hasCompounds = _compoundWords.Any(term => normalisedTitle.Contains(term, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
return !hasBannedWords &&
|
||||
!hasJavWords &&
|
||||
!hasAdultStars &&
|
||||
!hasBannedCompoundWords;
|
||||
!hasCompounds;
|
||||
}
|
||||
|
||||
public bool HasNoBannedTerms(string targetTitle)
|
||||
{
|
||||
var words = WordMatcher().Split(targetTitle.ToLower()).ToList();
|
||||
var normalisedTitle = targetTitle.NormalizeTitle();
|
||||
|
||||
var hasBannedWords = words.Any(word => wordCollections.AdultWords.Contains(word));
|
||||
var normalisedWords = normalisedTitle.Split(' ');
|
||||
|
||||
var inputWithoutSymbols = string.Join(' ', WordMatcher().Split(targetTitle.ToLower()));
|
||||
var hasBannedWords = normalisedWords.Where(word => word.Length >= 3).Any(word => normalisedWords.Contains(word, StringComparer.OrdinalIgnoreCase) && _wordCollections.AdultWords.Contains(word));
|
||||
|
||||
var hasJavWords = wordCollections.Jav.Any(jav => inputWithoutSymbols.Contains(jav, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var hasAdultStars = wordCollections.AdultStars.Any(star => inputWithoutSymbols.Contains(star, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var hasBannedCompoundWords = wordCollections.AdultCompoundPhrases.Any(compoundWord => inputWithoutSymbols.Contains(compoundWord, StringComparison.OrdinalIgnoreCase));
|
||||
var hasCompounds = _compoundWords.Any(term => normalisedTitle.Contains(term, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
return !hasBannedWords &&
|
||||
!hasJavWords &&
|
||||
!hasAdultStars &&
|
||||
!hasBannedCompoundWords;
|
||||
!hasCompounds;
|
||||
}
|
||||
|
||||
public bool MeetsTitleConditions(string targetTitle, List<string> years, string testTitle) => MatchesTitle(targetTitle, years, testTitle) && HasNoBannedTerms(targetTitle, testTitle);
|
||||
@@ -343,10 +344,10 @@ public partial class ParsingService(IWordCollections wordCollections, ITorrentTi
|
||||
.Where(word => word.Length > 3)
|
||||
.ToList();
|
||||
|
||||
return processedTitle.Count(word => !wordCollections.CommonWords.Contains(word));
|
||||
return processedTitle.Count(word => !_wordCollections.CommonWords.Contains(word));
|
||||
}
|
||||
|
||||
public ParsedFilename Parse(string name) => torrentTitleParser.Parse(name);
|
||||
public ParsedFilename Parse(string name) => _torrentTitleParser.Parse(name);
|
||||
|
||||
public TorrentType GetTypeByName(string name) => torrentTitleParser.GetTypeByName(name);
|
||||
public TorrentType GetTypeByName(string name) => _torrentTitleParser.GetTypeByName(name);
|
||||
}
|
||||
|
||||
31
src/tissue/Configuration/logging.json
Normal file
31
src/tissue/Configuration/logging.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"Serilog": {
|
||||
"Using": [ "Serilog.Sinks.Console" ],
|
||||
"MinimumLevel": {
|
||||
"Default": "Information",
|
||||
"Override": {
|
||||
"Microsoft": "Warning",
|
||||
"System": "Warning",
|
||||
"System.Net.Http.HttpClient.Scraper.LogicalHandler": "Warning",
|
||||
"System.Net.Http.HttpClient.Scraper.ClientHandler": "Warning",
|
||||
"Quartz.Impl.StdSchedulerFactory": "Warning",
|
||||
"Quartz.Core.QuartzScheduler": "Warning",
|
||||
"Quartz.Simpl.RAMJobStore": "Warning",
|
||||
"Quartz.Core.JobRunShell": "Warning",
|
||||
"Quartz.Core.SchedulerSignalerImpl": "Warning"
|
||||
}
|
||||
},
|
||||
"WriteTo": [
|
||||
{
|
||||
"Name": "Console",
|
||||
"Args": {
|
||||
"outputTemplate": "{Timestamp:HH:mm:ss} [{Level}] [{SourceContext}] {Message}{NewLine}{Exception}"
|
||||
}
|
||||
}
|
||||
],
|
||||
"Enrich": [ "FromLogContext", "WithMachineName", "WithThreadId" ],
|
||||
"Properties": {
|
||||
"Application": "Tissue"
|
||||
}
|
||||
}
|
||||
}
|
||||
3004
src/tissue/Data/adult-compound-words.txt
Normal file
3004
src/tissue/Data/adult-compound-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
17681
src/tissue/Data/adult-stars.txt
Normal file
17681
src/tissue/Data/adult-stars.txt
Normal file
File diff suppressed because it is too large
Load Diff
1670
src/tissue/Data/adult-words.txt
Normal file
1670
src/tissue/Data/adult-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
234602
src/tissue/Data/jav.txt
Normal file
234602
src/tissue/Data/jav.txt
Normal file
File diff suppressed because it is too large
Load Diff
17
src/tissue/Dockerfile
Normal file
17
src/tissue/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
||||
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:8.0 AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
WORKDIR /App
|
||||
COPY . ./
|
||||
RUN dotnet restore -a $TARGETARCH
|
||||
RUN dotnet publish -c Release --no-restore -o out -a $TARGETARCH
|
||||
|
||||
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine
|
||||
|
||||
WORKDIR /App
|
||||
COPY --from=build /App/out .
|
||||
RUN addgroup -S tissue && adduser -S -G tissue tissue
|
||||
USER tissue
|
||||
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
||||
CMD pgrep -f dotnet || exit 1
|
||||
ENTRYPOINT ["dotnet", "Tissue.dll"]
|
||||
44
src/tissue/Extensions/ConfigurationExtensions.cs
Normal file
44
src/tissue/Extensions/ConfigurationExtensions.cs
Normal file
@@ -0,0 +1,44 @@
|
||||
namespace Tissue.Extensions;
|
||||
|
||||
/// <summary>
/// Helpers that assemble the configuration sources and register strongly typed
/// configuration objects in the service collection.
/// </summary>
public static class ConfigurationExtensions
{
    private const string ConfigurationFolder = "Configuration";
    private const string LoggingConfig = "logging.json";

    /// <summary>
    /// Points the builder at the <c>Configuration</c> folder under the application base
    /// directory, then adds the logging JSON file (required, reloaded on change),
    /// environment variables and user secrets, in that order.
    /// </summary>
    /// <param name="configuration">The builder to add the sources to.</param>
    /// <returns>The same <paramref name="configuration"/> instance.</returns>
    public static IConfigurationBuilder AddScrapeConfiguration(this IConfigurationBuilder configuration)
    {
        var configurationRoot = Path.Combine(AppContext.BaseDirectory, ConfigurationFolder);

        configuration.SetBasePath(configurationRoot);
        configuration.AddJsonFile(LoggingConfig, optional: false, reloadOnChange: true);
        configuration.AddEnvironmentVariables();
        configuration.AddUserSecrets<Program>();

        return configuration;
    }

    /// <summary>
    /// Binds the named configuration section to <typeparamref name="TConfiguration"/> and
    /// registers the result as a singleton unless one is already registered.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <param name="configuration">The configuration to read the section from.</param>
    /// <param name="sectionName">Name of the section to bind.</param>
    /// <returns>The bound configuration instance.</returns>
    /// <exception cref="ArgumentNullException">The section could not be bound to an instance.</exception>
    public static TConfiguration LoadConfigurationFromConfig<TConfiguration>(this IServiceCollection services, IConfiguration configuration, string sectionName)
        where TConfiguration : class
    {
        var section = configuration.GetSection(sectionName);
        var instance = section.Get<TConfiguration>();

        ArgumentNullException.ThrowIfNull(instance, nameof(instance));
        services.TryAddSingleton(instance);

        return instance;
    }

    /// <summary>
    /// Creates <typeparamref name="TConfiguration"/> through its parameterless constructor
    /// and registers it as a singleton unless one is already registered.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <returns>The created configuration instance.</returns>
    public static TConfiguration LoadConfigurationFromEnv<TConfiguration>(this IServiceCollection services)
        where TConfiguration : class
    {
        var instance = Activator.CreateInstance<TConfiguration>();

        ArgumentNullException.ThrowIfNull(instance, nameof(instance));
        services.TryAddSingleton(instance);

        return instance;
    }
}
|
||||
68
src/tissue/Extensions/EnvironmentExtensions.cs
Normal file
68
src/tissue/Extensions/EnvironmentExtensions.cs
Normal file
@@ -0,0 +1,68 @@
|
||||
namespace Tissue.Extensions;
|
||||
|
||||
/// <summary>
/// Reads environment variables named <c>{prefix}_{varName}</c> and converts them to
/// typed values. An unset or empty variable is treated as "not provided".
/// </summary>
public static class EnvironmentExtensions
{
    /// <summary>
    /// Reads a boolean variable. <c>true</c>, <c>yes</c> and <c>1</c> (case-insensitive,
    /// surrounding whitespace ignored) are truthy; any other non-empty value is <c>false</c>.
    /// </summary>
    /// <param name="prefix">Variable name prefix.</param>
    /// <param name="varName">Variable name without the prefix.</param>
    /// <param name="fallback">Value returned when the variable is unset or empty.</param>
    public static bool GetEnvironmentVariableAsBool(this string prefix, string varName, bool fallback = false)
    {
        var raw = ReadVariable(prefix, varName);

        if (raw is null)
        {
            return fallback;
        }

        var candidate = raw.Trim();

        // Ordinal comparison keeps the result independent of the process culture.
        return candidate.Equals("true", StringComparison.OrdinalIgnoreCase)
               || candidate.Equals("yes", StringComparison.OrdinalIgnoreCase)
               || candidate.Equals("1", StringComparison.Ordinal);
    }

    /// <summary>
    /// Reads an integer variable.
    /// </summary>
    /// <param name="prefix">Variable name prefix.</param>
    /// <param name="varName">Variable name without the prefix.</param>
    /// <param name="fallback">Value returned when the variable is unset, empty or not an integer.</param>
    public static int GetEnvironmentVariableAsInt(this string prefix, string varName, int fallback = 0)
    {
        var raw = ReadVariable(prefix, varName);

        if (raw is null)
        {
            return fallback;
        }

        // Environment values are machine-readable, so parse with the invariant culture.
        return int.TryParse(
            raw,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var parsed)
            ? parsed
            : fallback;
    }

    /// <summary>
    /// Reads a string variable that must be present.
    /// </summary>
    /// <param name="prefix">Variable name prefix.</param>
    /// <param name="varName">Variable name without the prefix.</param>
    /// <exception cref="InvalidOperationException">The variable is unset or empty.</exception>
    public static string GetRequiredEnvironmentVariableAsString(this string prefix, string varName) =>
        ReadVariable(prefix, varName)
        ?? throw new InvalidOperationException($"Environment variable {GetFullVariableName(prefix, varName)} is not set");

    /// <summary>
    /// Reads a string variable that may be absent.
    /// </summary>
    /// <param name="prefix">Variable name prefix.</param>
    /// <param name="varName">Variable name without the prefix.</param>
    /// <param name="fallback">Value returned when the variable is unset or empty; may be <c>null</c>.</param>
    /// <returns>The variable value, or <paramref name="fallback"/> (possibly <c>null</c>).</returns>
    public static string? GetOptionalEnvironmentVariableAsString(this string prefix, string varName, string? fallback = null) =>
        ReadVariable(prefix, varName) ?? fallback;

    /// <summary>Returns the variable value, or <c>null</c> when it is unset or empty.</summary>
    private static string? ReadVariable(string prefix, string varName)
    {
        var value = Environment.GetEnvironmentVariable(GetFullVariableName(prefix, varName));

        return string.IsNullOrEmpty(value) ? null : value;
    }

    private static string GetFullVariableName(string prefix, string varName) => $"{prefix}_{varName}";
}
|
||||
28
src/tissue/Extensions/ServiceCollectionExtensions.cs
Normal file
28
src/tissue/Extensions/ServiceCollectionExtensions.cs
Normal file
@@ -0,0 +1,28 @@
|
||||
namespace Tissue.Extensions;
|
||||
|
||||
/// <summary>
/// Service registration helpers for logging and the Kleenex hosted service.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Replaces the registered logging providers with a Serilog logger built from
    /// <paramref name="configuration"/>.
    /// </summary>
    /// <param name="services">The service collection to configure.</param>
    /// <param name="configuration">Configuration the Serilog settings are read from.</param>
    /// <returns>The same service collection.</returns>
    public static IServiceCollection AddSerilogLogging(this IServiceCollection services, IConfiguration configuration)
    {
        services.AddLogging(loggingBuilder => UseSerilog(loggingBuilder, configuration));

        return services;
    }

    /// <summary>
    /// Registers <see cref="KleenexService"/> as a hosted service.
    /// </summary>
    /// <param name="services">The service collection to configure.</param>
    /// <returns>The same service collection.</returns>
    public static IServiceCollection AddKleenexService(this IServiceCollection services)
    {
        services.AddHostedService<KleenexService>();

        return services;
    }

    // Clears the existing providers, then plugs in a Serilog logger read from configuration.
    private static void UseSerilog(ILoggingBuilder loggingBuilder, IConfiguration configuration)
    {
        loggingBuilder.ClearProviders();

        var serilogLogger = new LoggerConfiguration()
            .ReadFrom.Configuration(configuration)
            .CreateLogger();

        loggingBuilder.AddSerilog(serilogLogger);
    }
}
|
||||
24
src/tissue/Extensions/StringExtensions.cs
Normal file
24
src/tissue/Extensions/StringExtensions.cs
Normal file
@@ -0,0 +1,24 @@
|
||||
namespace Tissue.Extensions;
|
||||
|
||||
/// <summary>
/// String helpers used when matching torrent titles against the word lists.
/// </summary>
public static partial class StringExtensions
{
    private static readonly char[] separator = [' '];

    // Matches every character that is not an ASCII letter, an ASCII digit or a space.
    [GeneratedRegex("[^a-zA-Z0-9 ]")]
    private static partial Regex NotAlphaNumeric();

    /// <summary>Extension-method form of <see cref="string.IsNullOrEmpty(string?)"/>.</summary>
    public static bool IsNullOrEmpty([System.Diagnostics.CodeAnalysis.NotNullWhen(false)] this string? value) =>
        string.IsNullOrEmpty(value);

    /// <summary>
    /// Normalizes a title for word matching: every character other than ASCII letters,
    /// digits and spaces becomes a space, runs of spaces collapse to one, and the result
    /// is lower-cased.
    /// </summary>
    /// <param name="title">The raw title; <c>null</c> is treated like an empty title.</param>
    /// <returns>The normalized title, or an empty string for a null or empty input.</returns>
    public static string NormalizeTitle(this string? title)
    {
        if (string.IsNullOrEmpty(title))
        {
            return string.Empty;
        }

        var alphanumericTitle = NotAlphaNumeric().Replace(title, " ");
        var words = alphanumericTitle.Split(separator, StringSplitOptions.RemoveEmptyEntries);

        // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a dotless 'ı'
        // under Turkish cultures, which would stop titles matching the word lists.
        return string.Join(' ', words).ToLowerInvariant();
    }
}
|
||||
44
src/tissue/Features/DataProcessing/DapperDataStorage.cs
Normal file
44
src/tissue/Features/DataProcessing/DapperDataStorage.cs
Normal file
@@ -0,0 +1,44 @@
|
||||
namespace Tissue.Features.DataProcessing;
|
||||
|
||||
/// <summary>
/// Dapper/Npgsql implementation of <see cref="IDataStorage"/>. Each operation opens its
/// own connection from <see cref="PostgresConfiguration.StorageConnectionString"/>.
/// Database errors are logged and not rethrown.
/// </summary>
public class DapperDataStorage(PostgresConfiguration configuration, ILogger<DapperDataStorage> logger) : IDataStorage
{
    /// <summary>
    /// Reads every row of the <c>torrents</c> table.
    /// </summary>
    /// <param name="cancellationToken">Cancels opening the connection and running the query.</param>
    /// <returns>The torrents read, or an empty collection when the query fails.</returns>
    public async Task<IReadOnlyCollection<Torrent>?> GetAllTorrents(CancellationToken cancellationToken = default)
    {
        const string GetAllTorrentsSql = "SELECT * FROM torrents";

        try
        {
            await using var connection = await CreateAndOpenConnection(cancellationToken);

            // CommandDefinition is how Dapper receives the cancellation token.
            var command = new CommandDefinition(GetAllTorrentsSql, cancellationToken: cancellationToken);
            var torrents = await connection.QueryAsync<Torrent>(command);

            return torrents.ToList();
        }
        catch (Exception e)
        {
            logger.LogError(e, "Error while getting torrents from database");
            return Array.Empty<Torrent>();
        }
    }

    /// <summary>
    /// Deletes the torrents whose <c>infoHash</c> is in <paramref name="infoHashes"/>.
    /// </summary>
    /// <param name="infoHashes">Info hashes to delete; an empty collection is a no-op.</param>
    /// <param name="cancellationToken">Cancels opening the connection and running the delete.</param>
    public async Task DeleteTorrentsByInfoHashes(IReadOnlyCollection<string> infoHashes, CancellationToken cancellationToken = default)
    {
        const string deleteTorrentsSql = "DELETE FROM torrents WHERE \"infoHash\" = ANY(@infoHashes)";

        if (infoHashes.Count == 0)
        {
            return;
        }

        try
        {
            await using var connection = await CreateAndOpenConnection(cancellationToken);

            var command = new CommandDefinition(deleteTorrentsSql, new { infoHashes }, cancellationToken: cancellationToken);
            await connection.ExecuteAsync(command);
        }
        catch (Exception e)
        {
            logger.LogError(e, "Error while deleting torrents from database");
        }
    }

    // Opens a new connection; disposes it if opening fails so it is not leaked.
    private async Task<NpgsqlConnection> CreateAndOpenConnection(CancellationToken cancellationToken = default)
    {
        var connection = new NpgsqlConnection(configuration.StorageConnectionString);

        try
        {
            await connection.OpenAsync(cancellationToken);
            return connection;
        }
        catch
        {
            await connection.DisposeAsync();
            throw;
        }
    }
}
|
||||
7
src/tissue/Features/DataProcessing/IDataStorage.cs
Normal file
7
src/tissue/Features/DataProcessing/IDataStorage.cs
Normal file
@@ -0,0 +1,7 @@
|
||||
namespace Tissue.Features.DataProcessing;
|
||||
|
||||
/// <summary>
/// Storage operations for torrents.
/// </summary>
public interface IDataStorage
{
    /// <summary>Returns all stored torrents.</summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The torrents; the declared type allows <c>null</c>.</returns>
    Task<IReadOnlyCollection<Torrent>?> GetAllTorrents(CancellationToken cancellationToken = default);

    /// <summary>Deletes the torrents identified by the given info hashes.</summary>
    /// <param name="infoHashes">Info hashes of the torrents to delete.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task DeleteTorrentsByInfoHashes(IReadOnlyCollection<string> infoHashes, CancellationToken cancellationToken = default);
}
|
||||
19
src/tissue/Features/DataProcessing/PostgresConfiguration.cs
Normal file
19
src/tissue/Features/DataProcessing/PostgresConfiguration.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
namespace Tissue.Features.DataProcessing;
|
||||
|
||||
/// <summary>
/// PostgreSQL connection settings read from <c>POSTGRES_*</c> environment variables when
/// the instance is created. Host, user, password and database are required; the port
/// defaults to 5432.
/// </summary>
public class PostgresConfiguration
{
    private const string Prefix = "POSTGRES";
    private const string HostVariable = "HOST";
    private const string UsernameVariable = "USER";
    private const string PasswordVariable = "PASSWORD";
    private const string DatabaseVariable = "DB";
    private const string PortVariable = "PORT";

    private string Host { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(HostVariable);
    private string Username { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(UsernameVariable);
    private string Password { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(PasswordVariable);
    private string Database { get; init; } = Prefix.GetRequiredEnvironmentVariableAsString(DatabaseVariable);
    private int Port { get; init; } = Prefix.GetEnvironmentVariableAsInt(PortVariable, 5432);

    /// <summary>
    /// Connection string for Npgsql. It is built with <see cref="NpgsqlConnectionStringBuilder"/>
    /// so values containing <c>;</c>, <c>=</c> or quotes (typically the password) are
    /// escaped rather than corrupting the string.
    /// </summary>
    public string StorageConnectionString => BuildConnectionString();

    private string BuildConnectionString()
    {
        var connectionBuilder = new NpgsqlConnectionStringBuilder();

        connectionBuilder.Host = Host;
        connectionBuilder.Port = Port;
        connectionBuilder.Username = Username;
        connectionBuilder.Password = Password;
        connectionBuilder.Database = Database;

        return connectionBuilder.ConnectionString;
    }
}
|
||||
@@ -0,0 +1,12 @@
|
||||
namespace Tissue.Features.DataProcessing;
|
||||
|
||||
/// <summary>
/// Registration helpers for the data-processing feature.
/// </summary>
internal static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <see cref="PostgresConfiguration"/> (via <c>LoadConfigurationFromEnv</c>)
    /// and a transient <see cref="IDataStorage"/> backed by <see cref="DapperDataStorage"/>.
    /// </summary>
    /// <param name="services">The service collection to configure.</param>
    /// <returns>The same service collection.</returns>
    internal static IServiceCollection AddDataStorage(this IServiceCollection services)
    {
        services.LoadConfigurationFromEnv<PostgresConfiguration>();

        return services.AddTransient<IDataStorage, DapperDataStorage>();
    }
}
|
||||
20
src/tissue/Features/DataProcessing/Torrent.cs
Normal file
20
src/tissue/Features/DataProcessing/Torrent.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
namespace Tissue.Features.DataProcessing;
|
||||
|
||||
/// <summary>
/// A torrent record. Instances are materialized by Dapper from
/// <c>SELECT * FROM torrents</c>, so the property names are part of the mapping and
/// must not be renamed.
/// </summary>
public class Torrent
{
    // Identity and origin.
    public string? InfoHash { get; set; }
    public string? Provider { get; set; }
    public string? TorrentId { get; set; }

    // Descriptive metadata; Title is the field checked for banned terms.
    public string? Title { get; set; }
    public long? Size { get; set; }
    public string? Type { get; set; }
    public DateTime UploadDate { get; set; }
    public short? Seeders { get; set; }
    public string? Trackers { get; set; }
    public string? Languages { get; set; }
    public string? Resolution { get; set; }

    // Processing flags.
    public bool Reviewed { get; set; }
    public bool Opened { get; set; }

    // Timestamps.
    public DateTime CreatedAt { get; set; }
    public DateTime UpdatedAt { get; set; }
}
|
||||
93
src/tissue/Features/Kleenex/KleenexService.cs
Normal file
93
src/tissue/Features/Kleenex/KleenexService.cs
Normal file
@@ -0,0 +1,93 @@
|
||||
namespace Tissue.Features.Kleenex;
|
||||
|
||||
/// <summary>
/// One-shot hosted service: loads every torrent, deletes those whose title contains a
/// banned term, then terminates the process (exit code 0 on success, 1 on error).
/// </summary>
public class KleenexService(IDataStorage dataStorage, ILogger<KleenexService> logger, IWordCollections wordCollections) : IHostedService
{
    // Single words shorter than this are never matched against the adult word list.
    private const int MinimumBannedWordLength = 3;

    // Union of the compound-phrase, JAV and adult-star lists; matched as substrings.
    private readonly HashSet<string> _combinedCompounds = [];

    /// <summary>
    /// Runs the clean-up and exits the process when done.
    /// </summary>
    /// <param name="cancellationToken">Passed through to the storage calls.</param>
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        try
        {
            logger.LogInformation("Kleenex service started");
            logger.LogInformation("Get ready to pop all that corn...");

            var torrents = await LoadTorrents(cancellationToken);

            BuildCombinedCompounds();

            var infoHashesToDelete = GetInfoHashesToDelete(torrents);

            await dataStorage.DeleteTorrentsByInfoHashes(infoHashesToDelete, cancellationToken);

            logger.LogInformation("Deleted {TorrentCount} torrents", infoHashesToDelete.Count);

            logger.LogInformation("Kleenex service completed successfully");

            Environment.Exit(0);
        }
        catch (Exception e)
        {
            logger.LogError(e, "Error while processing torrents");
            Environment.Exit(1);
        }
    }

    /// <summary>Logs the shutdown; there is nothing to release.</summary>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        logger.LogInformation("Service Shutdown");
        return Task.CompletedTask;
    }

    // Merges the substring-matched lists into one set.
    private void BuildCombinedCompounds()
    {
        _combinedCompounds.UnionWith(wordCollections.AdultCompoundPhrases);
        _combinedCompounds.UnionWith(wordCollections.Jav);
        _combinedCompounds.UnionWith(wordCollections.AdultStars);

        // A blank entry would make string.Contains succeed for every title and flag
        // the whole table for deletion.
        _combinedCompounds.RemoveWhere(string.IsNullOrWhiteSpace);
    }

    // Collects the info hashes of torrents with banned titles; exits the process with
    // code 0 when there is nothing to delete.
    private List<string> GetInfoHashesToDelete(IReadOnlyCollection<Torrent> torrents)
    {
        var infoHashesToDelete = torrents
            .Where(torrent => HasBannedTerms(torrent.Title))
            .Select(torrent => torrent.InfoHash)
            .OfType<string>() // drops rows without an info hash; they cannot be targeted by the delete
            .ToList();

        if (infoHashesToDelete.Count == 0)
        {
            logger.LogInformation("No torrents to delete");
            Environment.Exit(0);
        }

        return infoHashesToDelete;
    }

    // Loads all torrents; exits the process with code 0 when none are returned.
    private async Task<IReadOnlyCollection<Torrent>> LoadTorrents(CancellationToken cancellationToken)
    {
        var torrents = await dataStorage.GetAllTorrents(cancellationToken);

        if (torrents is null || torrents.Count == 0)
        {
            logger.LogInformation("No torrents found");
            Environment.Exit(0);
        }

        logger.LogInformation("Found {TorrentCount} torrents", torrents.Count);
        return torrents;
    }

    // True when the normalized title contains a banned single word (length >= 3) or any
    // combined compound term as a substring. A missing title is treated as clean.
    private bool HasBannedTerms(string? targetTitle)
    {
        if (string.IsNullOrWhiteSpace(targetTitle))
        {
            return false;
        }

        var normalisedTitle = targetTitle.NormalizeTitle();

        var hasBannedWords = normalisedTitle
            .Split(' ')
            .Any(word => word.Length >= MinimumBannedWordLength && wordCollections.AdultWords.Contains(word));

        // The compound scan walks every term, so skip it once a banned word is found.
        var hasBannedTerms = hasBannedWords
                             || _combinedCompounds.Any(term => normalisedTitle.Contains(term, StringComparison.OrdinalIgnoreCase));

        if (!hasBannedTerms)
        {
            // Debug level: one Information entry per clean torrent floods the log on a large table.
            logger.LogDebug("No banned terms found in torrent title: {Title}", targetTitle);
            return false;
        }

        logger.LogWarning("Banned terms found in torrent title: {Title}", targetTitle);
        return true;
    }
}
|
||||
14
src/tissue/Features/Wordlists/IWordCollections.cs
Normal file
14
src/tissue/Features/Wordlists/IWordCollections.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
namespace Tissue.Features.Wordlists;
|
||||
|
||||
/// <summary>
/// The banned-term word lists. The sets are filled by <see cref="LoadAsync"/>.
/// </summary>
public interface IWordCollections
{
    /// <summary>Single banned words.</summary>
    HashSet<string> AdultWords { get; }

    /// <summary>Banned compound phrases.</summary>
    HashSet<string> AdultCompoundPhrases { get; }

    /// <summary>Entries of the JAV list.</summary>
    HashSet<string> Jav { get; }

    /// <summary>Entries of the adult-stars list.</summary>
    HashSet<string> AdultStars { get; }

    /// <summary>Loads all word lists.</summary>
    Task LoadAsync();
}
|
||||
20
src/tissue/Features/Wordlists/PopulationService.cs
Normal file
20
src/tissue/Features/Wordlists/PopulationService.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
namespace Tissue.Features.Wordlists;
|
||||
|
||||
/// <summary>
/// Hosted service that loads the word collections at start-up and logs the size of
/// each list.
/// </summary>
public class PopulationService(IWordCollections wordCollections, ILogger<PopulationService> logger) : IHostedService
{
    /// <summary>Loads the word lists, then reports their entry counts.</summary>
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        logger.LogInformation("Loading word collections...");
        await wordCollections.LoadAsync();

        var adultWordCount = wordCollections.AdultWords.Count;
        var compoundPhraseCount = wordCollections.AdultCompoundPhrases.Count;
        var javCount = wordCollections.Jav.Count;
        var adultStarCount = wordCollections.AdultStars.Count;

        logger.LogInformation("Adult Words Count: {Count}", adultWordCount);
        logger.LogInformation("Adult Compound Phrases Count: {Count}", compoundPhraseCount);
        logger.LogInformation("Jav Count: {Count}", javCount);
        logger.LogInformation("Adult Stars Count: {Count}", adultStarCount);

        logger.LogInformation("Word collections loaded.");
    }

    /// <summary>Nothing to stop.</summary>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }
}
|
||||
12
src/tissue/Features/Wordlists/ServiceCollectionExtensions.cs
Normal file
12
src/tissue/Features/Wordlists/ServiceCollectionExtensions.cs
Normal file
@@ -0,0 +1,12 @@
|
||||
namespace Tissue.Features.Wordlists;
|
||||
|
||||
/// <summary>
/// Registration helpers for the word-list feature.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <see cref="WordCollections"/> as the singleton <see cref="IWordCollections"/>
    /// and <see cref="PopulationService"/> as a hosted service.
    /// </summary>
    /// <param name="services">The service collection to configure.</param>
    /// <returns>The same service collection.</returns>
    public static IServiceCollection RegisterWordCollections(this IServiceCollection services) =>
        services
            .AddSingleton<IWordCollections, WordCollections>()
            .AddHostedService<PopulationService>();
}
|
||||
55
src/tissue/Features/Wordlists/WordCollections.cs
Normal file
55
src/tissue/Features/Wordlists/WordCollections.cs
Normal file
@@ -0,0 +1,55 @@
|
||||
namespace Tissue.Features.Wordlists;
|
||||
|
||||
/// <summary>
/// Loads the banned-term word lists from text files (one entry per line) in the
/// <c>Data</c> folder under the application base directory.
/// </summary>
public class WordCollections : IWordCollections
{
    private const string AdultWordsFile = "adult-words.txt";
    private const string AdultCompoundPhrasesFile = "adult-compound-words.txt";
    private const string AdultStarsFile = "adult-stars.txt";
    private const string JavFile = "jav.txt";

    /// <summary>Entries of <c>adult-words.txt</c>.</summary>
    public HashSet<string> AdultWords { get; private set; } = [];

    /// <summary>Entries of <c>adult-compound-words.txt</c>.</summary>
    public HashSet<string> AdultCompoundPhrases { get; private set; } = [];

    /// <summary>Entries of <c>adult-stars.txt</c>.</summary>
    public HashSet<string> AdultStars { get; private set; } = [];

    /// <summary>Entries of <c>jav.txt</c>.</summary>
    public HashSet<string> Jav { get; private set; } = [];

    /// <summary>
    /// Reads all four word-list files concurrently and replaces the current sets once
    /// every file has been read.
    /// </summary>
    public async Task LoadAsync()
    {
        var adultWords = LoadWordList(AdultWordsFile);
        var adultCompounds = LoadWordList(AdultCompoundPhrasesFile);
        var jav = LoadWordList(JavFile);
        var adultStars = LoadWordList(AdultStarsFile);

        await Task.WhenAll(adultWords, adultCompounds, jav, adultStars);

        AdultWords = await adultWords;
        AdultCompoundPhrases = await adultCompounds;
        Jav = await jav;
        AdultStars = await adultStars;
    }

    // Reads one word-list file into a set. Blank and whitespace-only lines are skipped:
    // such an entry is a substring of practically every title, so substring matching
    // would flag everything as banned.
    private static async Task<HashSet<string>> LoadWordList(string fileName)
    {
        var lines = await File.ReadAllLinesAsync(GetPath(fileName));

        var entries = new HashSet<string>(lines.Length);

        foreach (var line in lines)
        {
            if (!string.IsNullOrWhiteSpace(line))
            {
                entries.Add(line);
            }
        }

        return entries;
    }

    private static string GetPath(string fileName) => Path.Combine(AppContext.BaseDirectory, "Data", fileName);
}
|
||||
13
src/tissue/GlobalUsings.cs
Normal file
13
src/tissue/GlobalUsings.cs
Normal file
@@ -0,0 +1,13 @@
|
||||
// Global using directives
|
||||
|
||||
global using System.Text.RegularExpressions;
|
||||
global using Dapper;
|
||||
global using Microsoft.Extensions.DependencyInjection;
|
||||
global using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
global using Microsoft.Extensions.Logging;
|
||||
global using Npgsql;
|
||||
global using Serilog;
|
||||
global using Tissue.Extensions;
|
||||
global using Tissue.Features.DataProcessing;
|
||||
global using Tissue.Features.Kleenex;
|
||||
global using Tissue.Features.Wordlists;
|
||||
14
src/tissue/Program.cs
Normal file
14
src/tissue/Program.cs
Normal file
@@ -0,0 +1,14 @@
|
||||
// Entry point: builds the host, wires configuration and services, then runs until shutdown.
var builder = Host.CreateApplicationBuilder();

builder.Configuration.AddScrapeConfiguration();

// Registration order is kept as-is: the word collections are registered before the
// Kleenex service, which reads them.
var services = builder.Services;
services.AddDataStorage();
services.RegisterWordCollections();
services.AddSerilogLogging(builder.Configuration);
services.AddKleenexService();

await builder.Build().RunAsync();
|
||||
35
src/tissue/Tissue.csproj
Normal file
35
src/tissue/Tissue.csproj
Normal file
@@ -0,0 +1,35 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk.Worker">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<UserSecretsId>54cad2ee-57df-4bb2-a192-d5d501448e0b</UserSecretsId>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Dapper" Version="2.1.28" />
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
|
||||
<PackageReference Include="Npgsql" Version="8.0.1" />
|
||||
<PackageReference Include="Serilog" Version="3.1.1" />
|
||||
<PackageReference Include="Serilog.AspNetCore" Version="8.0.1" />
|
||||
<PackageReference Include="Serilog.Sinks.Console" Version="5.0.1" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Content Remove="Configuration\*.json" />
|
||||
<None Include="Configuration\*.json">
|
||||
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Content Remove="Data\**" />
|
||||
<None Include="Data\**">
|
||||
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user