[wip] bridge python and c# and bring in rank torrent name (#177)

* [wip] bridge python and c# and bring in rank torrent name

* Container restores package now

Includes two dev scripts to install the python packages locally for debugging purposes.

* Introduce slightly tuned title matching scoring, by making it length-aware

This should help distinguish sequels from the original title, e.g. Terminator 2 vs. Terminator.

* Version bump

Also fixes postgres healthcheck so that it utilises the user from the stack.env file
This commit is contained in:
iPromKnight
2024-03-28 10:13:50 +00:00
committed by GitHub
parent cc15a69517
commit 4308a0ee71
24 changed files with 318 additions and 112 deletions

View File

@@ -0,0 +1,2 @@
**/python/
.idea/

View File

@@ -6,6 +6,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharedContracts", "..\share
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shared", "shared", "{FF5CA857-51E8-4446-8840-2A1D24ED3952}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "eng", "eng", "{1AE7F597-24C4-4575-B59F-67A625D95C1E}"
ProjectSection(SolutionItems) = preProject
eng\install-python-reqs.ps1 = eng\install-python-reqs.ps1
eng\install-python-reqs.sh = eng\install-python-reqs.sh
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU

View File

@@ -0,0 +1,2 @@
# Installs the pinned rank-torrent-name package into the producer's local
# "src/python" folder so the Python modules are available when debugging.

# Stop on the first cmdlet error instead of carrying on with a half-prepared folder.
$ErrorActionPreference = 'Stop'

# Resolve the target relative to this script rather than the caller's working
# directory, so the script can be started from anywhere.
$targetDir = Join-Path $PSScriptRoot '../src/python'

# -Force creates missing parent folders and does not fail when the folder already exists,
# which keeps the script re-runnable.
New-Item -ItemType Directory -Force -Path $targetDir | Out-Null

pip install --force-reinstall rank-torrent-name==0.1.6 -t $targetDir

# Native commands do not honour $ErrorActionPreference; surface pip failures explicitly.
if ($LASTEXITCODE -ne 0) {
    exit $LASTEXITCODE
}

View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Installs the pinned rank-torrent-name package into the producer's local
# "src/python" folder so the Python modules are available when debugging.

# Abort on the first failing command, on unset variables and on pipeline errors,
# so pip never runs against a folder that could not be created.
set -euo pipefail

# Resolve the target relative to this script rather than the caller's working
# directory, so the script can be started from anywhere.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
target_dir="${script_dir}/../src/python"

mkdir -p "${target_dir}"
pip install --force-reinstall rank-torrent-name==0.1.6 -t "${target_dir}"

View File

@@ -0,0 +1,2 @@
**/python/
.idea/

View File

@@ -8,13 +8,21 @@ WORKDIR /src/producer/src
RUN dotnet restore -a $TARGETARCH
RUN dotnet publish -c Release --no-restore -o /src/out -a $TARGETARCH
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine3.19
WORKDIR /app
ENV PYTHONUNBUFFERED=1
RUN apk add --update --no-cache python3=~3.11.8-r0 py3-pip && ln -sf python3 /usr/bin/python
COPY --from=build /src/out .
RUN rm -rf /app/python && mkdir -p /app/python
RUN pip3 install --force-reinstall rank-torrent-name==0.1.6 -t /app/python
RUN addgroup -S producer && adduser -S -G producer producer
USER producer
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD pgrep -f dotnet || exit 1
ENV PYTHONNET_PYDLL=/usr/lib/libpython3.11.so.1.0
ENTRYPOINT ["dotnet", "Producer.dll"]

View File

@@ -1,3 +1,5 @@
using Microsoft.VisualBasic;
namespace Producer.Features.Crawlers.Dmm;
public partial class DebridMediaManagerCrawler(
@@ -5,11 +7,12 @@ public partial class DebridMediaManagerCrawler(
ILogger<DebridMediaManagerCrawler> logger,
IDataStorage storage,
GithubConfiguration githubConfiguration,
IParseTorrentTitle parseTorrentTitle,
IRankTorrentName rankTorrentName,
IDistributedCache cache) : BaseCrawler(logger, storage)
{
[GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
private static partial Regex HashCollectionMatcher();
private LengthAwareRatioScorer _lengthAwareRatioScorer = new();
private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";
protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
@@ -107,74 +110,65 @@ public partial class DebridMediaManagerCrawler(
{
return null;
}
var parsedTorrent = parseTorrentTitle.Parse(torrentTitle.CleanTorrentTitleForImdb());
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.Title);
var parsedTorrent = rankTorrentName.Parse(torrentTitle.CleanTorrentTitleForImdb());
if (cached)
{
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.Title);
return new()
{
Source = Source,
Name = cachedResult.Title,
Imdb = cachedResult.ImdbId,
Size = bytesElement.GetInt64().ToString(),
InfoHash = hashElement.ToString(),
Seeders = 0,
Leechers = 0,
Category = parsedTorrent.TorrentType switch
{
TorrentType.Movie => "movies",
TorrentType.Tv => "tv",
_ => "unknown",
},
};
}
var imdbEntry = await Storage.FindImdbMetadata(parsedTorrent.Title, parsedTorrent.TorrentType, parsedTorrent.Year);
if (imdbEntry.Count == 0)
if (!parsedTorrent.Success)
{
return null;
}
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntry);
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.ParsedTitle);
if (cached)
{
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.ParsedTitle);
return MapToTorrent(cachedResult, bytesElement, hashElement, parsedTorrent);
}
var year = parsedTorrent.Year != 0 ? parsedTorrent.Year.ToString() : null;
var imdbEntries = await Storage.FindImdbMetadata(parsedTorrent.ParsedTitle, parsedTorrent.IsMovie ? "movies" : "tv", year);
if (imdbEntries.Count == 0)
{
return null;
}
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntries);
if (!scoredTitles.Success)
{
return null;
}
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.Title, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.ParsedTitle, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
var torrent = new IngestedTorrent
return MapToTorrent(scoredTitles.BestMatch.Value, bytesElement, hashElement, parsedTorrent);
}
private IngestedTorrent MapToTorrent(ImdbEntry result, JsonElement bytesElement, JsonElement hashElement, ParseTorrentTitleResponse parsedTorrent) =>
new()
{
Source = Source,
Name = scoredTitles.BestMatch.Value.Title,
Imdb = scoredTitles.BestMatch.Value.ImdbId,
Name = result.Title,
Imdb = result.ImdbId,
Size = bytesElement.GetInt64().ToString(),
InfoHash = hashElement.ToString(),
Seeders = 0,
Leechers = 0,
Category = parsedTorrent.TorrentType switch
Category = parsedTorrent.IsMovie switch
{
TorrentType.Movie => "movies",
TorrentType.Tv => "tv",
_ => "unknown",
true => "movies",
false => "tv",
},
};
return torrent;
}
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(TorrentMetadata parsedTorrent, List<ImdbEntry> imdbEntries)
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(ParseTorrentTitleResponse parsedTorrent, List<ImdbEntry> imdbEntries)
{
var lowerCaseTitle = parsedTorrent.Title.ToLowerInvariant();
var lowerCaseTitle = parsedTorrent.ParsedTitle.ToLowerInvariant();
// Scoring directly operates on the List<ImdbEntry>, no need for lookup table.
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: new DefaultRatioScorer(), cutoff: 90);
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: _lengthAwareRatioScorer, cutoff: 90);
var best = scoredResults.MaxBy(x => x.Score);
@@ -192,7 +186,7 @@ public partial class DebridMediaManagerCrawler(
{
var cacheOptions = new DistributedCacheEntryOptions
{
AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(15),
AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1),
};
return cache.SetStringAsync(lowerCaseTitle, JsonSerializer.Serialize(best.Value), cacheOptions);

View File

@@ -0,0 +1,24 @@
namespace Producer.Features.DataProcessing
{
    /// <summary>
    /// Ratio scorer that takes the score produced by <see cref="DefaultRatioScorer"/> and scales it
    /// by the length ratio of the two inputs (shorter length divided by longer length), so that
    /// strings of very different lengths receive a lower score.
    /// </summary>
    public class LengthAwareRatioScorer : IRatioScorer
    {
        private readonly IRatioScorer _defaultScorer = new DefaultRatioScorer();

        /// <summary>Scores the two inputs with the default scorer and applies the length penalty.</summary>
        /// <param name="input1">First string to compare.</param>
        /// <param name="input2">Second string to compare.</param>
        /// <returns>The length-scaled score, capped at 100.</returns>
        public int Score(string input1, string input2) =>
            ApplyLengthRatio(_defaultScorer.Score(input1, input2), input1, input2);

        /// <summary>
        /// Scores the two inputs with the default scorer using <paramref name="preprocessMode"/>
        /// and applies the length penalty.
        /// </summary>
        /// <param name="input1">First string to compare.</param>
        /// <param name="input2">Second string to compare.</param>
        /// <param name="preprocessMode">Preprocessing mode forwarded to the default scorer.</param>
        /// <returns>The length-scaled score, capped at 100.</returns>
        /// <remarks>The length ratio is computed from the inputs as passed in, not from any preprocessed form.</remarks>
        public int Score(string input1, string input2, PreprocessMode preprocessMode) =>
            ApplyLengthRatio(_defaultScorer.Score(input1, input2, preprocessMode), input1, input2);

        /// <summary>Scales <paramref name="score"/> by min(length) / max(length) of the two inputs.</summary>
        private static int ApplyLengthRatio(int score, string input1, string input2)
        {
            var longest = Math.Max(input1.Length, input2.Length);

            if (longest == 0)
            {
                // Both inputs are empty, i.e. of equal length. Dividing would give 0/0 = NaN, and
                // casting NaN to int yields an unspecified value, so keep the base score as-is.
                return Math.Min(score, 100);
            }

            var shortest = Math.Min(input1.Length, input2.Length);
            var lengthRatio = (double)shortest / longest;

            return Math.Min((int)(score * lengthRatio), 100);
        }
    }
}

View File

@@ -9,7 +9,8 @@ internal static class ServiceCollectionExtensions
services.AddTransient<IDataStorage, DapperDataStorage>();
services.AddTransient<IMessagePublisher, TorrentPublisher>();
services.AddSingleton<IParseTorrentTitle, ParseTorrentTitle>();
services.RegisterPythonEngine();
services.AddSingleton<IRankTorrentName, RankTorrentName>();
services.AddStackExchangeRedisCache(options =>
{
options.Configuration = redisConfiguration.ConnectionString;

View File

@@ -7,6 +7,8 @@ global using System.Text.RegularExpressions;
global using System.Xml.Linq;
global using FuzzySharp;
global using FuzzySharp.Extractor;
global using FuzzySharp.PreProcess;
global using FuzzySharp.SimilarityRatio.Scorer;
global using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive;
global using LZStringCSharp;
global using MassTransit;
@@ -23,11 +25,10 @@ global using Producer.Features.Crawlers.Torrentio;
global using Producer.Features.CrawlerSupport;
global using Producer.Features.DataProcessing;
global using Producer.Features.JobSupport;
global using PromKnight.ParseTorrentTitle;
global using Serilog;
global using SharedContracts.Configuration;
global using SharedContracts.Dapper;
global using SharedContracts.Extensions;
global using SharedContracts.Models;
global using SharedContracts.Requests;
global using StackExchange.Redis;
global using SharedContracts.Python;
global using SharedContracts.Python.RTN;
global using SharedContracts.Requests;

View File

@@ -19,6 +19,7 @@
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
<PackageReference Include="Polly" Version="8.3.0" />
<PackageReference Include="pythonnet" Version="3.0.3" />
<PackageReference Include="Quartz.Extensions.DependencyInjection" Version="3.8.0" />
<PackageReference Include="Quartz.Extensions.Hosting" Version="3.8.0" />
<PackageReference Include="Serilog" Version="3.1.1" />
@@ -34,9 +35,9 @@
</None>
</ItemGroup>
<ItemGroup>
<Content Remove="Data\**" />
<None Include="Data\**">
<ItemGroup Condition="'$(Configuration)' == 'Debug'">
<Content Remove="python\**" />
<None Include="python\**">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>

View File

@@ -115,10 +115,10 @@ public class DapperDataStorage(PostgresConfiguration configuration, RabbitMqConf
return result.ToList();
}, "Error getting imdb metadata.", cancellationToken);
public async Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, TorrentType torrentType, string? year, CancellationToken cancellationToken = default) =>
public async Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, string torrentType, string? year, CancellationToken cancellationToken = default) =>
await ExecuteCommandAsync(async connection =>
{
var query = $"select \"imdb_id\" as \"ImdbId\", \"title\" as \"Title\", \"year\" as \"Year\" from search_imdb_meta('{parsedTorrentTitle.Replace("'", "").Replace("\"", "")}', '{(torrentType == TorrentType.Movie ? "movie" : "tvSeries")}'";
var query = $"select \"imdb_id\" as \"ImdbId\", \"title\" as \"Title\", \"year\" as \"Year\" from search_imdb_meta('{parsedTorrentTitle.Replace("'", "").Replace("\"", "")}', '{(torrentType.Equals("movie", StringComparison.OrdinalIgnoreCase) ? "movie" : "tvSeries")}'";
query += year is not null ? $", '{year}'" : ", NULL";
query += ", 15)";

View File

@@ -9,7 +9,7 @@ public interface IDataStorage
Task<DapperResult<PageIngestedResult, PageIngestedResult>> MarkPageAsIngested(string pageId, CancellationToken cancellationToken = default);
Task<DapperResult<int, int>> GetRowCountImdbMetadata(CancellationToken cancellationToken = default);
Task<List<ImdbEntry>> GetImdbEntriesForRequests(int year, int batchSize, string? stateLastProcessedImdbId, CancellationToken cancellationToken = default);
Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, TorrentType parsedTorrentTorrentType, string? parsedTorrentYear, CancellationToken cancellationToken = default);
Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, string parsedTorrentTorrentType, string? parsedTorrentYear, CancellationToken cancellationToken = default);
Task InsertTorrent(Torrent torrent, CancellationToken cancellationToken = default);
Task InsertFiles(IEnumerable<TorrentFile> files, CancellationToken cancellationToken = default);
Task InsertSubtitles(IEnumerable<SubtitleFile> subtitles, CancellationToken cancellationToken = default);

View File

@@ -1,4 +1,3 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
namespace SharedContracts.Extensions;

View File

@@ -6,10 +6,11 @@ global using MassTransit;
global using Microsoft.AspNetCore.Builder;
global using Microsoft.AspNetCore.Hosting;
global using Microsoft.Extensions.Configuration;
global using Microsoft.Extensions.DependencyInjection;
global using Microsoft.Extensions.Hosting;
global using Microsoft.Extensions.Logging;
global using Npgsql;
global using PromKnight.ParseTorrentTitle;
global using Python.Runtime;
global using Serilog;
global using SharedContracts.Configuration;
global using SharedContracts.Extensions;

View File

@@ -0,0 +1,49 @@
namespace SharedContracts.Python;

/// <summary>
/// Hosted service that initialises the embedded Python engine when the host starts and shuts it
/// down when the host stops.
/// </summary>
/// <remarks>
/// The Python shared library location is read from the <c>PYTHONNET_PYDLL</c> environment variable.
/// If the variable is missing or initialisation throws, the process is terminated with exit code 1.
/// </remarks>
public class PythonEngineService(ILogger<PythonEngineService> logger) : IHostedService
{
    // Handle returned by PythonEngine.BeginAllowThreads(); handed back to EndAllowThreads() on stop.
    private IntPtr _mainThreadState;

    // True only after Initialize() and BeginAllowThreads() have both completed.
    private bool _isInitialized;

    /// <summary>Initialises the Python engine once; later calls are no-ops.</summary>
    /// <param name="cancellationToken">Not used.</param>
    /// <returns>A completed task.</returns>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        if (_isInitialized)
        {
            return Task.CompletedTask;
        }

        try
        {
            var pythonDllEnv = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL");

            if (string.IsNullOrWhiteSpace(pythonDllEnv))
            {
                logger.LogWarning("PYTHONNET_PYDLL env is not set. Exiting Application");
                Environment.Exit(1);
                return Task.CompletedTask;
            }

            Runtime.PythonDLL = pythonDllEnv;
            PythonEngine.Initialize();
            _mainThreadState = PythonEngine.BeginAllowThreads();

            _isInitialized = true;
            logger.LogInformation("Python engine initialized");
        }
        catch (Exception e)
        {
            logger.LogWarning(e, "Failed to initialize Python engine");
            Environment.Exit(1);
        }

        return Task.CompletedTask;
    }

    /// <summary>Shuts the Python engine down if, and only if, it was initialised by this service.</summary>
    /// <param name="cancellationToken">Not used.</param>
    /// <returns>A completed task.</returns>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        // Without a successful StartAsync there is no saved thread state to restore and no
        // engine to shut down; calling into the runtime here would operate on a zero handle.
        if (!_isInitialized)
        {
            return Task.CompletedTask;
        }

        // Flip the flag first so a repeated stop does not run the teardown a second time.
        _isInitialized = false;

        PythonEngine.EndAllowThreads(_mainThreadState);
        _mainThreadState = IntPtr.Zero;
        PythonEngine.Shutdown();

        return Task.CompletedTask;
    }
}

View File

@@ -0,0 +1,8 @@
namespace SharedContracts.Python.RTN;

/// <summary>
/// Torrent-name operations exposed to the rest of the application.
/// </summary>
public interface IRankTorrentName
{
    /// <summary>Parses a torrent title.</summary>
    /// <param name="title">The torrent title to parse.</param>
    /// <returns>The parse outcome as a <see cref="ParseTorrentTitleResponse"/>.</returns>
    ParseTorrentTitleResponse Parse(string title);

    /// <summary>Checks <paramref name="title"/> against <paramref name="checkTitle"/>.</summary>
    /// <param name="title">The first title.</param>
    /// <param name="checkTitle">The title to compare against.</param>
    /// <returns><c>true</c> when the titles are reported as matching.</returns>
    bool TitleMatch(string title, string checkTitle);

    /// <summary>Checks whether a torrent title is classified as trash.</summary>
    /// <param name="title">The torrent title to check.</param>
    /// <returns><c>true</c> when the title is reported as trash.</returns>
    bool IsTrash(string title);
}

View File

@@ -0,0 +1,6 @@
namespace SharedContracts.Python.RTN;

/// <summary>
/// Result of parsing a torrent title.
/// </summary>
/// <param name="Success">Whether parsing produced a usable result.</param>
/// <param name="ParsedTitle">The title extracted from the torrent name.</param>
/// <param name="Year">The extracted year; callers in this codebase treat 0 as "no year".</param>
/// <param name="Season">Season numbers, or <c>null</c> when none were found.</param>
/// <param name="Episode">Episode numbers, or <c>null</c> when none were found.</param>
public record ParseTorrentTitleResponse(
    bool Success,
    string ParsedTitle,
    int Year,
    int[]? Season = null,
    int[]? Episode = null)
{
    /// <summary>
    /// <c>true</c> when neither season nor episode information is present.
    /// </summary>
    public bool IsMovie => Season is null && Episode is null;
}

View File

@@ -0,0 +1,118 @@
namespace SharedContracts.Python.RTN;

/// <summary>
/// <see cref="IRankTorrentName"/> implementation that forwards every call to the Python
/// <c>RTN</c> module through the embedded Python runtime.
/// </summary>
/// <remarks>
/// The modules are imported in the constructor, and every call into Python acquires the GIL
/// first. Failures inside Python are logged and reported as a negative result rather than thrown.
/// </remarks>
public class RankTorrentName : IRankTorrentName
{
    private const string SysModuleName = "sys";
    private const string RtnModuleName = "RTN";

    private readonly ILogger<RankTorrentName> _logger;
    private dynamic? _sys;
    private dynamic? _rtn;

    /// <summary>Creates the wrapper and imports the Python modules it depends on.</summary>
    /// <param name="logger">Logger used for import and call failures.</param>
    public RankTorrentName(ILogger<RankTorrentName> logger)
    {
        _logger = logger;
        InitModules();
    }

    /// <summary>Parses <paramref name="title"/> with the Python module's <c>parse</c> function.</summary>
    /// <param name="title">The torrent title to parse.</param>
    /// <returns>
    /// The mapped result, or a response with <c>Success = false</c> when the module is not loaded,
    /// the call yields no result, or an exception is raised.
    /// </returns>
    public ParseTorrentTitleResponse Parse(string title)
    {
        try
        {
            using var py = Py.GIL();

            var result = _rtn?.parse(title);

            if (result == null)
            {
                return new(false, string.Empty, 0);
            }

            return ParseResult(result);
        }
        catch (Exception e)
        {
            _logger.LogError(e, "Failed to parse title");
            return new(false, string.Empty, 0);
        }
    }

    /// <summary>Asks the Python module's <c>check_trash</c> function about <paramref name="title"/>.</summary>
    /// <param name="title">The torrent title to check.</param>
    /// <returns>The module's answer, or <c>false</c> when no answer could be obtained.</returns>
    public bool IsTrash(string title)
    {
        try
        {
            using var py = Py.GIL();

            var result = _rtn?.check_trash(title);

            if (result == null)
            {
                return false;
            }

            var response = result.As<bool>() ?? false;

            return response;
        }
        catch (Exception e)
        {
            // Describe the operation that actually failed, not the unrelated parse operation.
            _logger.LogError(e, "Failed to check if title is trash");
            return false;
        }
    }

    /// <summary>
    /// Asks the Python module's <c>title_match</c> function whether the two titles match.
    /// </summary>
    /// <param name="title">The first title.</param>
    /// <param name="checkTitle">The title to compare against.</param>
    /// <returns>The module's answer, or <c>false</c> when no answer could be obtained.</returns>
    public bool TitleMatch(string title, string checkTitle)
    {
        try
        {
            using var py = Py.GIL();

            var result = _rtn?.title_match(title, checkTitle);

            if (result == null)
            {
                return false;
            }

            var response = result.As<bool>() ?? false;

            return response;
        }
        catch (Exception e)
        {
            // Describe the operation that actually failed, not the unrelated parse operation.
            _logger.LogError(e, "Failed to match titles");
            return false;
        }
    }

    /// <summary>
    /// Maps the Python parse result onto a <see cref="ParseTorrentTitleResponse"/> by reading its
    /// <c>parsed_title</c>, <c>year</c>, <c>season</c> and <c>episode</c> attributes.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this relies on <c>year</c> being convertible to <c>int</c> and on
    /// <c>season</c>/<c>episode</c> being Python lists - confirm against the RTN model; a
    /// conversion failure surfaces as an exception that <see cref="Parse"/> turns into a failed response.
    /// </remarks>
    private static ParseTorrentTitleResponse ParseResult(dynamic result)
    {
        var parsedTitle = result.GetAttr("parsed_title")?.As<string>() ?? string.Empty;
        var year = result.GetAttr("year")?.As<int>() ?? 0;
        var seasonList = result.GetAttr("season")?.As<PyList>();
        var episodeList = result.GetAttr("episode")?.As<PyList>();

        // Empty lists are reported as null so that IsMovie only sees real season/episode data.
        int[]? seasons = seasonList?.Length() > 0 ? seasonList.As<int[]>() : null;
        int[]? episodes = episodeList?.Length() > 0 ? episodeList.As<int[]>() : null;

        return new ParseTorrentTitleResponse(true, parsedTitle, year, seasons, episodes);
    }

    /// <summary>
    /// Imports <c>sys</c>, appends the application's <c>python</c> folder to <c>sys.path</c>,
    /// then imports the <c>RTN</c> module.
    /// </summary>
    private void InitModules()
    {
        using var py = Py.GIL();

        _sys = Py.Import(SysModuleName);

        if (_sys == null)
        {
            _logger.LogError("Failed to import Python module: {ModuleName}", SysModuleName);
            return;
        }

        // The packages are installed next to the application binaries, in a "python" sub-folder.
        _sys.path.append(Path.Combine(AppContext.BaseDirectory, "python"));

        _rtn = Py.Import(RtnModuleName);

        if (_rtn == null)
        {
            _logger.LogError("Failed to import Python module: {ModuleName}", RtnModuleName);
        }
    }
}

View File

@@ -0,0 +1,13 @@
namespace SharedContracts.Python;

/// <summary>
/// Dependency-injection helpers for the embedded Python engine.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <see cref="PythonEngineService"/> as a singleton and exposes that same
    /// instance as a hosted service.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection RegisterPythonEngine(this IServiceCollection services) =>
        services
            .AddSingleton<PythonEngineService>()
            .AddHostedService(provider => provider.GetRequiredService<PythonEngineService>());
}

View File

@@ -16,7 +16,7 @@
<PackageReference Include="MassTransit.Abstractions" Version="8.2.0" />
<PackageReference Include="MassTransit.RabbitMQ" Version="8.2.0" />
<PackageReference Include="Npgsql" Version="8.0.2" />
<PackageReference Include="PromKnight.ParseTorrentTitle" Version="1.0.4" />
<PackageReference Include="pythonnet" Version="3.0.3" />
<PackageReference Include="Serilog" Version="3.1.1" />
<PackageReference Include="Serilog.Extensions.Hosting" Version="8.0.0" />
<PackageReference Include="Serilog.Settings.Configuration" Version="8.0.0" />