[skip ci] More work on parsing - seasons to fix still and use banned words

This commit is contained in:
iPromKnight
2024-03-10 03:52:21 +00:00
parent 51246ed352
commit 320fccc8e8
26 changed files with 177894 additions and 4898 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -12,8 +12,7 @@ public static class ConfigurationExtensions
configuration.AddJsonFile(LoggingConfig, false, true);
configuration.AddJsonFile(ScrapeConfiguration.Filename, false, true);
configuration.AddJsonFile(TorrentioConfiguration.Filename, false, true);
configuration.AddJsonFile(AdultContentConfiguration.Filename, false, true);
configuration.AddEnvironmentVariables();
configuration.AddUserSecrets<Program>();

View File

@@ -4,34 +4,19 @@ public partial class DebridMediaManagerCrawler(
IHttpClientFactory httpClientFactory,
ILogger<DebridMediaManagerCrawler> logger,
IDataStorage storage,
GithubConfiguration githubConfiguration,
AdultContentConfiguration adultContentConfiguration,
IServiceProvider serviceProvider) : BaseCrawler(logger, storage)
GithubConfiguration githubConfiguration) : BaseCrawler(logger, storage)
{
[GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
private static partial Regex HashCollectionMatcher();
[GeneratedRegex(@"[sS]([0-9]{1,2})|seasons?[\s-]?([0-9]{1,2})", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex SeasonMatcher();
[GeneratedRegex(@"[0-9]{4}", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex YearMatcher();
private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";
protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
protected override string Url => "https://api.github.com/repos/debridmediamanager/hashlists/git/trees/main?recursive=1";
protected override string Source => "DMM";
private IFuzzySearcher<string>? _adultContentSearcher;
public override async Task Execute()
{
if (!adultContentConfiguration.Allow)
{
_adultContentSearcher = serviceProvider.GetRequiredService<IFuzzySearcher<string>>();
}
var client = httpClientFactory.CreateClient("Scraper");
client.DefaultRequestHeaders.Authorization = new("Bearer", githubConfiguration.PAT);
client.DefaultRequestHeaders.UserAgent.ParseAdd("curl");
@@ -107,14 +92,14 @@ public partial class DebridMediaManagerCrawler(
private Torrent? ParseTorrent(JsonElement item)
{
if (!item.TryGetProperty("filename", out var filenameElement) ||
!item.TryGetProperty("bytes", out var bytesElement) ||
!item.TryGetProperty("hash", out var hashElement))
{
return null;
}
var torrent = new Torrent
{
Source = Source,
@@ -130,45 +115,37 @@ public partial class DebridMediaManagerCrawler(
return null;
}
torrent.Category = (SeasonMatcher().IsMatch(torrent.Name), YearMatcher().IsMatch(torrent.Name)) switch
var parsedTorrent = TorrentTitleParser.Parse(torrent.Name);
if (parsedTorrent.IsInvalid)
{
(true, _) => "tv",
(_, true) => "movies",
_ => "unknown",
};
return null;
}
return HandleAdultContent(torrent);
}
private Torrent HandleAdultContent(Torrent torrent)
{
try
if (parsedTorrent.IsMovie)
{
if (!adultContentConfiguration.Allow)
{
var adultMatch = _adultContentSearcher!.Search(torrent.Name.Replace(".", " "));
if (adultMatch.Count > 0)
{
logger.LogWarning("Adult content found in {Name}. Marking category as 'xxx'", torrent.Name);
logger.LogWarning("Matches: {TopMatch} {TopScore}", adultMatch.First().Value, adultMatch.First().Score);
torrent.Category = "xxx";
}
}
torrent.Category = "movies";
torrent.Name = parsedTorrent.Movie.Title;
return torrent;
}
catch (Exception e)
if (parsedTorrent.IsShow)
{
logger.LogWarning("Failed to handle adult content for {Name}: [{Error}]. Torrent will not be ingested at this time.", torrent.Name, e.Message);
return null;
torrent.Category = "tv";
torrent.Name = parsedTorrent.Show.Title;
return torrent;
}
return null;
}
private async Task InsertTorrentsForPage(JsonDocument json)
{
var torrents = json.RootElement.EnumerateArray()
.Select(ParseTorrent)
.Where(t => t is not null)
.ToList();
if (torrents.Count == 0)

View File

@@ -1,16 +1,13 @@
namespace Producer.Features.Crawlers.Dmm;
public partial class ParsingService(AdultContentConfiguration adultContentConfiguration)
public partial class ParsingService(IWordCollections wordCollections)
{
private static readonly char[] WhitespaceSeparator = [' '];
//todo: Populate dictionary
private static readonly HashSet<string> Dictionary = new HashSet<string>();
public static string Naked(string title) =>
public string Naked(string title) =>
NakedMatcher().Replace(title.ToLower(), "");
public static List<string> GrabYears(string str)
public List<string> GrabYears(string str)
{
var matches = GrabYearsMatcher().Matches(str);
return matches
@@ -19,7 +16,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
.ToList();
}
public static List<int> GrabPossibleSeasonNums(string str)
public List<int> GrabPossibleSeasonNums(string str)
{
var matches = GrabPossibleSeasonNumsMatcher().Matches(str);
return matches
@@ -28,7 +25,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
.ToList();
}
public static bool HasYear(string test, List<string> years, bool strictCheck = false) =>
public bool HasYear(string test, List<string> years, bool strictCheck = false) =>
strictCheck
? years.Any(test.Contains)
: years.Any(year =>
@@ -39,7 +36,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
test.Contains($"{intYear - 1}");
});
public static string RemoveDiacritics(string str)
public string RemoveDiacritics(string str)
{
var normalizedString = str.Normalize(NormalizationForm.FormD);
var stringBuilder = new StringBuilder();
@@ -56,9 +53,9 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}
public static string RemoveRepeats(string str) => RemoveRepeatsMatcher().Replace(str, "$1");
public string RemoveRepeats(string str) => RemoveRepeatsMatcher().Replace(str, "$1");
public static int RomanToDecimal(string roman)
public int RomanToDecimal(string roman)
{
var romanNumerals = new Dictionary<char, int>
{
@@ -84,9 +81,9 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
return total;
}
public static string ReplaceRomanWithDecimal(string input) => ReplaceRomanWithDecimalMatcher().Replace(input, match => RomanToDecimal(match.Value).ToString());
public static bool StrictEqual(string title1, string title2)
public string ReplaceRomanWithDecimal(string input) => ReplaceRomanWithDecimalMatcher().Replace(input, match => RomanToDecimal(match.Value).ToString());
public bool StrictEqual(string title1, string title2)
{
title1 = WhitespaceMatcher().Replace(title1, "");
title2 = WhitespaceMatcher().Replace(title2, "");
@@ -96,8 +93,8 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
(RemoveRepeats(title1).Length > 0 && RemoveRepeats(title1) == RemoveRepeats(title2)) ||
(RemoveDiacritics(title1).Length > 0 && RemoveDiacritics(title1) == RemoveDiacritics(title2));
}
public static int CountTestTermsInTarget(string test, string target, bool shouldBeInSequence = false)
public int CountTestTermsInTarget(string test, string target, bool shouldBeInSequence = false)
{
var replaceCount = 0;
var prevReplaceCount = 0;
@@ -134,7 +131,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
{
var prefix = first ? @"\b" : "";
var suffix = last ? @"\b" : "";
testStr = Regex.Replace(testStr.Substring(prevOffset + prevLength), $"{prefix}{newTerm}{suffix}", replacer);
testStr = Regex.Replace(testStr[(prevOffset + prevLength)..], $"{prefix}{newTerm}{suffix}", replacer);
};
var actual = wordsInTitle.Where((term, idx) =>
@@ -198,11 +195,11 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
}
return actual.Count;
}
public static bool FlexEq(string test, string target, List<string> years)
public bool FlexEq(string test, string target, List<string> years)
{
var movieTitle = TorrentTitleParser.Parse(test).Movie.Title.ToLower();
var tvTitle = TorrentTitleParser.Parse(test, true).Show.Title.ToLower();
var tvTitle = TorrentTitleParser.Parse(test).Show.Title.ToLower();
var target2 = WhitespaceMatcher().Replace(target, "");
var test2 = WhitespaceMatcher().Replace(test, "");
@@ -230,7 +227,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
return StrictEqual(target, movieTitle) || StrictEqual(target, tvTitle);
}
public static bool MatchesTitle(string target, List<string> years, string test)
public bool MatchesTitle(string target, List<string> years, string test)
{
target = target.ToLower();
test = test.ToLower();
@@ -250,7 +247,7 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
return false;
}
var keyTerms = splits.Where(s => (s.Length > 1 && !Dictionary.Contains(s)) || s.Length > 5).ToList();
var keyTerms = splits.Where(s => (s.Length > 1 && !wordCollections.CommonWords.Contains(s)) || s.Length > 5).ToList();
keyTerms.AddRange(target.Split(WhitespaceSeparator, StringSplitOptions.RemoveEmptyEntries).Where(e => e.Length > 2));
var keySet = new HashSet<string>(keyTerms);
var commonTerms = splits.Where(s => !keySet.Contains(s)).ToList();
@@ -269,8 +266,8 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
return Math.Floor(score / 0.85) >= totalScore;
}
public static bool IncludesMustHaveTerms(List<string> mustHaveTerms, string testTitle) =>
public bool IncludesMustHaveTerms(List<string> mustHaveTerms, string testTitle) =>
mustHaveTerms.All(term =>
{
var newTitle = testTitle.Replace(term, "");
@@ -301,24 +298,24 @@ public partial class ParsingService(AdultContentConfiguration adultContentConfig
{
var words = WordMatcher().Split(testTitle.ToLower()).Where(word => word.Length > 3).ToList();
var hasBannedWords = words.Any(word => !targetTitle.Contains(word) && adultContentConfiguration.Keywords.Contains(word));
var hasBannedWords = words.Any(word => !targetTitle.Contains(word) && wordCollections.AdultWords.Contains(word));
var titleWithoutSymbols = string.Join(' ', WordMatcher().Split(testTitle.ToLower()));
var hasBannedCompoundWords = adultContentConfiguration.CompoundKeywords.Any(compoundWord => !targetTitle.Contains(compoundWord) && titleWithoutSymbols.Contains(compoundWord));
var hasBannedCompoundWords = wordCollections.AdultCompoundPhrases.Any(compoundWord => !targetTitle.Contains(compoundWord) && titleWithoutSymbols.Contains(compoundWord));
return !hasBannedWords && !hasBannedCompoundWords;
}
public bool MeetsTitleConditions(string targetTitle, List<string> years, string testTitle) => MatchesTitle(targetTitle, years, testTitle) && HasNoBannedTerms(targetTitle, testTitle);
public static int CountUncommonWords(string title)
public int CountUncommonWords(string title)
{
var processedTitle = WhitespaceMatcher().Split(title)
.Select(word => WordProcessingMatcher().Replace(word.ToLower(), ""))
.Where(word => word.Length > 3)
.ToList();
return processedTitle.Count(word => !Dictionary.Contains(word));
return processedTitle.Count(word => !wordCollections.CommonWords.Contains(word));
}
}
}

View File

@@ -22,12 +22,14 @@ public static partial class GroupParser
var nowebsiteTitle = WebsitePrefixExp().Replace(title, "");
TitleParser.Parse(nowebsiteTitle, out var releaseTitle, out _);
releaseTitle = releaseTitle.Replace(" ", ".");
var trimmed = nowebsiteTitle
.Replace(" ", ".")
.Replace(releaseTitle == nowebsiteTitle ? "" : releaseTitle, "")
.Replace(".-.", ".");
var trimmed = nowebsiteTitle.Replace(" ", ".");
if (releaseTitle != nowebsiteTitle)
{
trimmed = trimmed.Replace(releaseTitle, "");
}
trimmed = trimmed.Replace(".-.", ".");
trimmed = TitleParser.SimplifyTitle(FileExtensionParser.RemoveFileExtension(trimmed.Trim()));
if (trimmed.Length == 0)
@@ -66,4 +68,4 @@ public static partial class GroupParser
return null;
}
}
}

View File

@@ -21,7 +21,7 @@ public partial class ResolutionParser
private static partial Regex R480Exp();
private static readonly Regex ResolutionExp = new(string.Join("|", R2160pExp(), R1080pExp(), R720pExp(), R576pExp(), R540pExp(), R480Exp()), RegexOptions.IgnoreCase);
public static void Parse(string title, out Resolution? resolution, out string? source)
{
resolution = null;
@@ -31,14 +31,16 @@ public partial class ResolutionParser
if (result.Success)
{
foreach (var key in Enum.GetNames(typeof(Resolution)))
foreach (var resolutionEnum in Resolution.List)
{
if (result.Groups[key].Success)
if (!result.Groups[resolutionEnum.Name].Success)
{
resolution = Resolution.FromName(key);
source = result.Groups[key].Value;
return;
continue;
}
resolution = resolutionEnum;
source = result.Groups[resolutionEnum.Name].Value;
return;
}
}
@@ -50,4 +52,4 @@ public partial class ResolutionParser
resolution = Resolution.R480P;
}
}
}
}

View File

@@ -8,7 +8,7 @@ public static partial class SeasonParser
[GeneratedRegex(@"(?<=[_.-])(?<airdate>(?<!\d)(?<airyear>[1-9]\d{1})(?<airmonth>[0-1][0-9])(?<airday>[0-3][0-9]))(?=[_.-])", RegexOptions.IgnoreCase)]
private static partial Regex SixDigitAirDateMatchExp();
public static Season Parse(string title)
public static Season? Parse(string title)
{
if (!PreValidation(title))
{
@@ -30,7 +30,10 @@ public static partial class SeasonParser
{
var fixedDate = $"20{airYear}.{airMonth}.{airDay}";
simpleTitle = simpleTitle.Replace(sixDigitAirDateMatch.Groups["airdate"]?.Value ?? "", fixedDate);
if (!string.IsNullOrEmpty(sixDigitAirDateMatch.Groups["airdate"].Value))
{
simpleTitle = simpleTitle.Replace(sixDigitAirDateMatch.Groups["airdate"].Value, fixedDate);
}
}
}
@@ -38,32 +41,33 @@ public static partial class SeasonParser
{
var match = exp().Match(simpleTitle);
if (match.Groups.Count > 0)
if (match.Groups.Count <= 0 || !match.Success)
{
var result = ParseMatchCollection(match, simpleTitle);
if (result.FullSeason && result.ReleaseTokens != null && result.ReleaseTokens.Contains("Special", StringComparison.OrdinalIgnoreCase))
{
result.FullSeason = false;
result.IsSpecial = true;
}
return new()
{
ReleaseTitle = title,
SeriesTitle = result.SeriesName,
// SeriesTitleInfo = 0,
Seasons = result.SeasonNumbers ?? [],
EpisodeNumbers = result.EpisodeNumbers ?? [],
AirDate = result.AirDate,
FullSeason = result.FullSeason,
IsPartialSeason = result.IsPartialSeason ?? false,
IsMultiSeason = result.IsMultiSeason ?? false,
IsSeasonExtra = result.IsSeasonExtra ?? false,
IsSpecial = result.IsSpecial ?? false,
SeasonPart = result.SeasonPart ?? 0,
};
continue;
}
var result = ParseMatchCollection(match, simpleTitle);
if (result.FullSeason && result.ReleaseTokens != null && result.ReleaseTokens.Contains("Special", StringComparison.OrdinalIgnoreCase))
{
result.FullSeason = false;
result.IsSpecial = true;
}
return new()
{
ReleaseTitle = title,
SeriesTitle = result.SeriesName,
Seasons = result.SeasonNumbers ?? [],
EpisodeNumbers = result.EpisodeNumbers ?? [],
AirDate = result.AirDate,
FullSeason = result.FullSeason,
IsPartialSeason = result.IsPartialSeason ?? false,
IsMultiSeason = result.IsMultiSeason ?? false,
IsSeasonExtra = result.IsSeasonExtra ?? false,
IsSpecial = result.IsSpecial ?? false,
SeasonPart = result.SeasonPart ?? 0,
};
}
return null;
@@ -78,7 +82,7 @@ public static partial class SeasonParser
throw new("No match");
}
var seriesName = (groups["title"]?.Value ?? "")
var seriesName = groups["title"].Value
.Replace(".", " ")
.Replace("_", " ")
.Replace(RequestInfoExp().ToString(), "")
@@ -89,9 +93,9 @@ public static partial class SeasonParser
SeriesName = seriesName,
};
var lastSeasonEpisodeStringIndex = IndexOfEnd(simpleTitle, groups["title"]?.Value ?? "");
var lastSeasonEpisodeStringIndex = IndexOfEnd(simpleTitle, groups["title"].Value);
if (int.TryParse(groups["airyear"]?.Value, out var airYear) && airYear >= 1900)
if (int.TryParse(groups["airyear"].Value, out var airYear) && airYear >= 1900)
{
var seasons = new List<string> {groups["season"]?.Value, groups["season1"]?.Value}
.Where(x => !string.IsNullOrEmpty(x))
@@ -300,4 +304,4 @@ public static partial class SeasonParser
public DateTime? AirDate { get; set; }
public string? ReleaseTokens { get; set; }
}
}
}

View File

@@ -88,16 +88,16 @@ public static partial class TitleParser
VideoCodecsParser.Parse(title, out var videoCodec, out _);
AudioChannelsParser.Parse(title, out var channels, out _);
AudioCodecsParser.Parse(title, out var audioCodec, out _);
var resolutionPosition = title.IndexOf(resolution.Value ?? string.Empty, StringComparison.Ordinal);
var videoCodecPosition = title.IndexOf(videoCodec.Value ?? string.Empty, StringComparison.Ordinal);
var channelsPosition = title.IndexOf(channels.Value ?? string.Empty, StringComparison.Ordinal);
var audioCodecPosition = title.IndexOf(audioCodec.Value ?? string.Empty, StringComparison.Ordinal);
var resolutionPosition = title.IndexOf(resolution?.Value ?? string.Empty, StringComparison.Ordinal);
var videoCodecPosition = title.IndexOf(videoCodec?.Value ?? string.Empty, StringComparison.Ordinal);
var channelsPosition = title.IndexOf(channels?.Value ?? string.Empty, StringComparison.Ordinal);
var audioCodecPosition = title.IndexOf(audioCodec?.Value ?? string.Empty, StringComparison.Ordinal);
var positions = new List<int> {resolutionPosition, audioCodecPosition, channelsPosition, videoCodecPosition}.Where(x => x > 0).ToList();
if (positions.Count != 0)
{
var firstPosition = positions.Min();
parsedTitle = ReleaseTitleCleaner(title[..firstPosition]) ?? string.Empty;
parsedTitle = ReleaseTitleCleaner(title[..firstPosition]);
year = null;
return;
}
@@ -149,10 +149,7 @@ public static partial class TitleParser
trimmedTitle = trimmedTitle.Replace(LanguageExp().ToString(), "").Trim();
trimmedTitle = trimmedTitle.Replace(SceneGarbageExp().ToString(), "").Trim();
foreach (var lang in Enum.GetValues(typeof(Language)).Cast<Language>())
{
trimmedTitle = trimmedTitle.Replace($@"\b{lang.ToString().ToUpper()}", "").Trim();
}
trimmedTitle = Language.List.Aggregate(trimmedTitle, (current, lang) => current.Replace($@"\b{lang.Value.ToUpper()}", "").Trim());
// Look for gap formed by removing items
trimmedTitle = trimmedTitle.Split(" ")[0];
@@ -168,7 +165,7 @@ public static partial class TitleParser
{
if (parts.Length >= n + 2)
{
nextPart = parts[n + 1] ?? "";
nextPart = parts[n + 1];
}
if (part.Length == 1 && part.ToLower() != "a" && !int.TryParse(part, out _))
@@ -197,4 +194,4 @@ public static partial class TitleParser
return result.Trim();
}
}
}

View File

@@ -1,8 +1,21 @@
namespace Producer.Features.ParseTorrentTitle;
public static class TorrentTitleParser
public static partial class TorrentTitleParser
{
public static ParsedFilename Parse(string name, bool isTv = false)
[GeneratedRegex(@"(season|episode)s?.?\d?", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex SeasonEpisode();
[GeneratedRegex(@"[se]\d\d", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex SeasonShort();
[GeneratedRegex(@"\b(tv|complete)\b", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex TvOrComplete();
[GeneratedRegex(@"\b(saison|stage).?\d", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex SeasonStage();
[GeneratedRegex(@"[a-z]\s?\-\s?\d{2,4}\b", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex Season();
[GeneratedRegex(@"\d{2,4}\s?\-\s?\d{2,4}\b", RegexOptions.IgnoreCase, "en-GB")]
private static partial Regex SeasonTwo();
public static ParsedFilename Parse(string name)
{
VideoCodecsParser.Parse(name, out var videoCodec, out _);
AudioCodecsParser.Parse(name, out var audioCodec, out _);
@@ -28,14 +41,21 @@ public static class TorrentTitleParser
Multi = multi,
Complete = complete,
};
var isTv = GetTypeByName(name) == TorrentType.Tv;
return !isTv ? ParseMovie(name, baseParsed) : ParseSeason(name, baseParsed);
}
private static ParsedFilename ParseSeason(string name, BaseParsed baseParsed)
{
var season = SeasonParser.Parse(name);
if (season == null)
{
return new();
}
return new()
{
Show = new()
@@ -69,7 +89,7 @@ public static class TorrentTitleParser
private static ParsedFilename ParseMovie(string name, BaseParsed baseParsed)
{
TitleParser.Parse(name, out var title, out var year);
baseParsed.Title = title;
baseParsed.Year = year;
@@ -93,4 +113,27 @@ public static class TorrentTitleParser
},
};
}
}
/// <summary>
/// Classifies a torrent name as TV or movie by probing it with the TV-indicator patterns.
/// </summary>
/// <param name="name">Raw torrent name to classify.</param>
/// <returns>
/// <see cref="TorrentType.Tv"/> as soon as any pattern matches the name;
/// otherwise <see cref="TorrentType.Movie"/>.
/// </returns>
private static TorrentType GetTypeByName(string name)
{
    // Pattern factories are probed in declaration order; Any() stops at the first hit.
    var tvIndicators = new Func<Regex>[]
    {
        SeasonEpisode,
        SeasonShort,
        TvOrComplete,
        SeasonStage,
        Season,
        SeasonTwo,
    };

    var looksLikeTv = tvIndicators.Any(indicator => indicator().IsMatch(name));

    return looksLikeTv ? TorrentType.Tv : TorrentType.Movie;
}
}

View File

@@ -0,0 +1,7 @@
namespace Producer.Features.ParseTorrentTitle;
/// <summary>
/// Kind of media a torrent name is classified as.
/// </summary>
public enum TorrentType
{
    /// <summary>The name is treated as a movie (the fallback when no TV pattern matches).</summary>
    Movie = 0,

    /// <summary>The name matched at least one TV pattern.</summary>
    Tv = 1,
}

View File

@@ -1,14 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// Settings that control whether adult content is allowed and which terms are used to detect it.
/// </summary>
public class AdultContentConfiguration
{
/// <summary>Name of the configuration section these settings are loaded from.</summary>
public const string SectionName = "AdultContentSettings";
/// <summary>Name of the JSON file that carries these settings.</summary>
public const string Filename = "adultcontent.json";
/// <summary>
/// When <c>true</c>, adult content is allowed and no adult-keyword searcher is registered or used.
/// </summary>
public bool Allow { get; set; }
/// <summary>Single adult keywords; supplied to the fuzzy searcher as its records.</summary>
public List<string> Keywords { get; set; } = [];
/// <summary>Multi-word adult phrases.</summary>
public List<string> CompoundKeywords { get; set; } = [];
/// <summary>Threshold value forwarded to <c>SearchOptions.Threshold</c> of the fuzzy searcher.</summary>
public int Threshold { get; set; }
}

View File

@@ -1,13 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// <see cref="IFuzzySearcher{T}"/> over a fixed list of strings, backed by <c>Process.ExtractSorted</c>.
/// </summary>
/// <param name="records">Candidate strings; copied once at construction.</param>
/// <param name="options">Search options; a default instance is used when <c>null</c>.</param>
public class FuzzyStringSearcher(IEnumerable<string> records, SearchOptions<string>? options = null) : IFuzzySearcher<string>
{
    // Snapshot of the candidates so later changes to the source sequence are not observed.
    private readonly IReadOnlyCollection<string> _candidates = records.ToList();

    private readonly SearchOptions<string> _settings = options ?? new SearchOptions<string>();

    /// <summary>
    /// Returns the candidates whose score against <paramref name="text"/> reaches the cutoff, sorted by the extractor.
    /// </summary>
    /// <param name="text">Text to search for.</param>
    /// <returns>The extracted results, materialised into a list.</returns>
    public IReadOnlyCollection<ExtractedResult<string>> Search(string text)
    {
        // The cutoff scales with the input length: ceil(length * threshold%).
        // NOTE(review): for long inputs this can exceed 100 — confirm that is intended.
        var thresholdFraction = _settings.Threshold / 100.0;
        var cutoff = (int) Math.Ceiling(text.Length * thresholdFraction);

        var ranked = Process.ExtractSorted(text, _candidates, cutoff: cutoff);

        return ranked.ToList();
    }
}

View File

@@ -1,6 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// Abstraction over a fuzzy text search that yields extracted results.
/// </summary>
/// <typeparam name="T">Type of the value carried by each extracted result.</typeparam>
public interface IFuzzySearcher<T>
{
/// <summary>
/// Searches for <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to search for.</param>
/// <returns>The extracted results for the search.</returns>
IReadOnlyCollection<ExtractedResult<T>> Search(string text);
}

View File

@@ -1,7 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// Options for a fuzzy search.
/// </summary>
/// <typeparam name="T">Type of the searched values; not used by any member of this class.</typeparam>
public class SearchOptions<T>
{
/// <summary>
/// Match threshold; defaults to 60. Can only be set during object initialisation.
/// </summary>
public int Threshold { get; init; } = 60;
}

View File

@@ -1,16 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// Container for the record structs describing search scoring, results and pattern indexes.
/// NOTE(review): no usages are visible from this file — confirm whether these types are still referenced.
/// </summary>
public class SearchResultRecords
{
/// <summary>Scoring inputs: error count, current/expected location, distance and an ignore-location flag.</summary>
public record struct ScoreInfo(int Errors, int CurrentLocation, int ExpectedLocation, int Distance,
bool IgnoreLocation);
/// <summary>Outcome of a single match attempt: whether it matched and its score.</summary>
public record struct SearchResult(bool IsMatch, double Score);
/// <summary>A pattern together with the chunks derived from it.</summary>
public record struct Index(List<Chunk> Chunks, string Pattern);
/// <summary>A piece of a pattern: its start index, its text and a per-character integer map.</summary>
public record struct Chunk(int StartIndex, string Pattern, Dictionary<char, int> Alphabet);
/// <summary>A matched value paired with its score.</summary>
public record struct SearchResult<T>(T Value, double Score);
}

View File

@@ -1,26 +0,0 @@
namespace Producer.Features.Text;
/// <summary>
/// Dependency-injection registration for the adult keyword filter.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Loads <see cref="AdultContentConfiguration"/> and, unless adult content is allowed,
    /// registers a singleton <see cref="IFuzzySearcher{T}"/> over the configured keywords.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <param name="configuration">Configuration the settings section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection RegisterAdultKeywordFilter(this IServiceCollection services, IConfiguration configuration)
    {
        var settings = services.LoadConfigurationFromConfig<AdultContentConfiguration>(
            configuration,
            AdultContentConfiguration.SectionName);

        // The searcher is only needed when adult content has to be filtered out.
        if (!settings.Allow)
        {
            services.AddSingleton<IFuzzySearcher<string>>(
                _ => new FuzzyStringSearcher(
                    settings.Keywords,
                    new SearchOptions<string>
                    {
                        Threshold = settings.Threshold,
                    }));
        }

        return services;
    }
}

View File

@@ -0,0 +1,12 @@
namespace Producer.Features.Wordlists;
/// <summary>
/// Word lists used when parsing and filtering titles.
/// </summary>
public interface IWordCollections
{
/// <summary>Single adult words.</summary>
HashSet<string> AdultWords { get; }
/// <summary>Multi-word adult phrases.</summary>
HashSet<string> AdultCompoundPhrases { get; }
/// <summary>Common words.</summary>
HashSet<string> CommonWords { get; }
/// <summary>
/// Loads the word collections.
/// </summary>
/// <returns>A task that completes when loading has finished.</returns>
Task LoadAsync();
}

View File

@@ -0,0 +1,19 @@
namespace Producer.Features.Wordlists;
/// <summary>
/// Hosted service that loads the word collections at application start and logs their sizes.
/// </summary>
/// <param name="wordCollections">Collections to populate.</param>
/// <param name="logger">Logger for progress messages.</param>
public class PopulationService(IWordCollections wordCollections, ILogger<PopulationService> logger) : IHostedService
{
    /// <summary>
    /// Loads the word collections and reports how many entries each one holds.
    /// </summary>
    /// <param name="cancellationToken">Not observed; loading always runs to completion.</param>
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        logger.LogInformation("Loading word collections...");

        await wordCollections.LoadAsync();
        ReportCounts();

        logger.LogInformation("Word collections loaded.");
    }

    /// <summary>Nothing to tear down; completes immediately.</summary>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }

    // Logs one line per collection with its entry count.
    private void ReportCounts()
    {
        logger.LogInformation("Common Words Count: {Count}", wordCollections.CommonWords.Count);
        logger.LogInformation("Adult Words Count: {Count}", wordCollections.AdultWords.Count);
        logger.LogInformation("Adult Compound Phrases Count: {Count}", wordCollections.AdultCompoundPhrases.Count);
    }
}

View File

@@ -0,0 +1,12 @@
namespace Producer.Features.Wordlists;
/// <summary>
/// Dependency-injection registration for the word collections.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <see cref="WordCollections"/> as the singleton <see cref="IWordCollections"/>
    /// and adds <see cref="PopulationService"/> as a hosted service.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection RegisterWordCollections(this IServiceCollection services) =>
        services
            .AddSingleton<IWordCollections, WordCollections>()
            .AddHostedService<PopulationService>();
}

View File

@@ -0,0 +1,44 @@
namespace Producer.Features.Wordlists;
/// <summary>
/// File-backed <see cref="IWordCollections"/>. Each collection is read from a text file
/// (one entry per line) in the <c>Data</c> folder under <see cref="AppContext.BaseDirectory"/>.
/// </summary>
public class WordCollections : IWordCollections
{
    private const string AdultWordsFile = "adult-words.txt";
    private const string AdultCompoundPhrasesFile = "adult-compound-words.txt";
    private const string CommonWordsFile = "common-words.txt";

    /// <summary>Entries of <c>adult-words.txt</c>; empty until <see cref="LoadAsync"/> has completed.</summary>
    public HashSet<string> AdultWords { get; private set; } = [];

    /// <summary>Entries of <c>adult-compound-words.txt</c>; empty until <see cref="LoadAsync"/> has completed.</summary>
    public HashSet<string> AdultCompoundPhrases { get; private set; } = [];

    /// <summary>Entries of <c>common-words.txt</c>; empty until <see cref="LoadAsync"/> has completed.</summary>
    public HashSet<string> CommonWords { get; private set; } = [];

    /// <summary>
    /// Reads the three word-list files concurrently and replaces the current collections.
    /// </summary>
    /// <remarks>
    /// The collections are only swapped in once every file has been read successfully,
    /// so a failure on one file never leaves the instance half-populated.
    /// </remarks>
    /// <returns>A task that completes when all collections have been replaced.</returns>
    /// <exception cref="IOException">A word-list file is missing or cannot be read.</exception>
    public async Task LoadAsync()
    {
        var adultWordsTask = ReadWordListAsync(AdultWordsFile);
        var adultCompoundsTask = ReadWordListAsync(AdultCompoundPhrasesFile);
        var commonWordsTask = ReadWordListAsync(CommonWordsFile);

        await Task.WhenAll(adultWordsTask, adultCompoundsTask, commonWordsTask);

        // All three tasks have completed successfully here; awaiting just unwraps the results.
        AdultWords = await adultWordsTask;
        AdultCompoundPhrases = await adultCompoundsTask;
        CommonWords = await commonWordsTask;
    }

    // Reads one word-list file into a set, trimming each line and skipping blank ones.
    private static async Task<HashSet<string>> ReadWordListAsync(string fileName)
    {
        var lines = await File.ReadAllLinesAsync(GetPath(fileName));
        var entries = new HashSet<string>(lines.Length);

        foreach (var line in lines)
        {
            // Blank or padded lines would otherwise become entries that either never
            // match a word or (for a whitespace-only phrase) match every multi-word title.
            var entry = line.Trim();
            if (entry.Length > 0)
            {
                entries.Add(entry);
            }
        }

        return entries;
    }

    // Resolves a word-list file name to its location in the application's Data folder.
    private static string GetPath(string fileName) => Path.Combine(AppContext.BaseDirectory, "Data", fileName);
}

View File

@@ -31,5 +31,5 @@ global using Producer.Features.CrawlerSupport;
global using Producer.Features.DataProcessing;
global using Producer.Features.JobSupport;
global using Producer.Features.ParseTorrentTitle;
global using Producer.Features.Text;
global using Producer.Features.Wordlists;
global using Serilog;

View File

@@ -34,4 +34,11 @@
</None>
</ItemGroup>
<ItemGroup>
<Content Remove="Data\**" />
<None Include="Data\**">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@@ -10,7 +10,7 @@ builder.Services
.RegisterMassTransit()
.AddDataStorage()
.AddCrawlers()
.RegisterAdultKeywordFilter(builder.Configuration)
.RegisterWordCollections()
.AddQuartz(builder.Configuration);
var host = builder.Build();