[wip] bridge python and c# and bring in rank torrent name (#177)

* [wip] bridge python and c# and bring in rank torrent name

* Container restores package now

Includes two dev scripts to install the python packages locally for debugging purposes.

* Introduce slightly turned title matching scoring, by making it length aware

this should help with sequels such as Terminator 2, vs Terminator etc

* Version bump

Also fixes postgres healthcheck so that it utilises the user from the stack.env file
This commit is contained in:
iPromKnight
2024-03-28 10:13:50 +00:00
committed by GitHub
parent cc15a69517
commit 4308a0ee71
24 changed files with 318 additions and 112 deletions

View File

@@ -0,0 +1,2 @@
**/python/
.idea/

View File

@@ -6,6 +6,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharedContracts", "..\share
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shared", "shared", "{FF5CA857-51E8-4446-8840-2A1D24ED3952}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "eng", "eng", "{1AE7F597-24C4-4575-B59F-67A625D95C1E}"
ProjectSection(SolutionItems) = preProject
eng\install-python-reqs.ps1 = eng\install-python-reqs.ps1
eng\install-python-reqs.sh = eng\install-python-reqs.sh
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU

View File

@@ -0,0 +1,2 @@
mkdir -p ../src/python
pip install --force-reinstall rank-torrent-name==0.1.6 -t ../src/python/

View File

@@ -0,0 +1,4 @@
#!/bin/bash
mkdir -p ../src/python
pip install --force-reinstall rank-torrent-name==0.1.6 -t ../src/python/

View File

@@ -0,0 +1,2 @@
**/python/
.idea/

View File

@@ -8,13 +8,21 @@ WORKDIR /src/producer/src
RUN dotnet restore -a $TARGETARCH
RUN dotnet publish -c Release --no-restore -o /src/out -a $TARGETARCH
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine3.19
WORKDIR /app
ENV PYTHONUNBUFFERED=1
RUN apk add --update --no-cache python3=~3.11.8-r0 py3-pip && ln -sf python3 /usr/bin/python
COPY --from=build /src/out .
RUN rm -rf /app/python && mkdir -p /app/python
RUN pip3 install --force-reinstall rank-torrent-name==0.1.6 -t /app/python
RUN addgroup -S producer && adduser -S -G producer producer
USER producer
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD pgrep -f dotnet || exit 1
ENV PYTHONNET_PYDLL=/usr/lib/libpython3.11.so.1.0
ENTRYPOINT ["dotnet", "Producer.dll"]

View File

@@ -1,3 +1,5 @@
using Microsoft.VisualBasic;
namespace Producer.Features.Crawlers.Dmm;
public partial class DebridMediaManagerCrawler(
@@ -5,11 +7,12 @@ public partial class DebridMediaManagerCrawler(
ILogger<DebridMediaManagerCrawler> logger,
IDataStorage storage,
GithubConfiguration githubConfiguration,
IParseTorrentTitle parseTorrentTitle,
IRankTorrentName rankTorrentName,
IDistributedCache cache) : BaseCrawler(logger, storage)
{
[GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
private static partial Regex HashCollectionMatcher();
private LengthAwareRatioScorer _lengthAwareRatioScorer = new();
private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";
protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
@@ -107,74 +110,65 @@ public partial class DebridMediaManagerCrawler(
{
return null;
}
var parsedTorrent = parseTorrentTitle.Parse(torrentTitle.CleanTorrentTitleForImdb());
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.Title);
var parsedTorrent = rankTorrentName.Parse(torrentTitle.CleanTorrentTitleForImdb());
if (cached)
{
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.Title);
return new()
{
Source = Source,
Name = cachedResult.Title,
Imdb = cachedResult.ImdbId,
Size = bytesElement.GetInt64().ToString(),
InfoHash = hashElement.ToString(),
Seeders = 0,
Leechers = 0,
Category = parsedTorrent.TorrentType switch
{
TorrentType.Movie => "movies",
TorrentType.Tv => "tv",
_ => "unknown",
},
};
}
var imdbEntry = await Storage.FindImdbMetadata(parsedTorrent.Title, parsedTorrent.TorrentType, parsedTorrent.Year);
if (imdbEntry.Count == 0)
if (!parsedTorrent.Success)
{
return null;
}
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntry);
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.ParsedTitle);
if (cached)
{
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.ParsedTitle);
return MapToTorrent(cachedResult, bytesElement, hashElement, parsedTorrent);
}
var year = parsedTorrent.Year != 0 ? parsedTorrent.Year.ToString() : null;
var imdbEntries = await Storage.FindImdbMetadata(parsedTorrent.ParsedTitle, parsedTorrent.IsMovie ? "movies" : "tv", year);
if (imdbEntries.Count == 0)
{
return null;
}
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntries);
if (!scoredTitles.Success)
{
return null;
}
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.Title, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.ParsedTitle, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
var torrent = new IngestedTorrent
return MapToTorrent(scoredTitles.BestMatch.Value, bytesElement, hashElement, parsedTorrent);
}
private IngestedTorrent MapToTorrent(ImdbEntry result, JsonElement bytesElement, JsonElement hashElement, ParseTorrentTitleResponse parsedTorrent) =>
new()
{
Source = Source,
Name = scoredTitles.BestMatch.Value.Title,
Imdb = scoredTitles.BestMatch.Value.ImdbId,
Name = result.Title,
Imdb = result.ImdbId,
Size = bytesElement.GetInt64().ToString(),
InfoHash = hashElement.ToString(),
Seeders = 0,
Leechers = 0,
Category = parsedTorrent.TorrentType switch
Category = parsedTorrent.IsMovie switch
{
TorrentType.Movie => "movies",
TorrentType.Tv => "tv",
_ => "unknown",
true => "movies",
false => "tv",
},
};
return torrent;
}
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(TorrentMetadata parsedTorrent, List<ImdbEntry> imdbEntries)
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(ParseTorrentTitleResponse parsedTorrent, List<ImdbEntry> imdbEntries)
{
var lowerCaseTitle = parsedTorrent.Title.ToLowerInvariant();
var lowerCaseTitle = parsedTorrent.ParsedTitle.ToLowerInvariant();
// Scoring directly operates on the List<ImdbEntry>, no need for lookup table.
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: new DefaultRatioScorer(), cutoff: 90);
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: _lengthAwareRatioScorer, cutoff: 90);
var best = scoredResults.MaxBy(x => x.Score);
@@ -192,7 +186,7 @@ public partial class DebridMediaManagerCrawler(
{
var cacheOptions = new DistributedCacheEntryOptions
{
AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(15),
AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1),
};
return cache.SetStringAsync(lowerCaseTitle, JsonSerializer.Serialize(best.Value), cacheOptions);

View File

@@ -0,0 +1,24 @@
namespace Producer.Features.DataProcessing
{
public class LengthAwareRatioScorer : IRatioScorer
{
private readonly IRatioScorer _defaultScorer = new DefaultRatioScorer();
public int Score(string input1, string input2)
{
var score = _defaultScorer.Score(input1, input2);
var lengthRatio = (double)Math.Min(input1.Length, input2.Length) / Math.Max(input1.Length, input2.Length);
var result = (int)(score * lengthRatio);
return result > 100 ? 100 : result;
}
public int Score(string input1, string input2, PreprocessMode preprocessMode)
{
var score = _defaultScorer.Score(input1, input2, preprocessMode);
var lengthRatio = (double)Math.Min(input1.Length, input2.Length) / Math.Max(input1.Length, input2.Length);
var result = (int)(score * lengthRatio);
return result > 100 ? 100 : result;
}
}
}

View File

@@ -9,7 +9,8 @@ internal static class ServiceCollectionExtensions
services.AddTransient<IDataStorage, DapperDataStorage>();
services.AddTransient<IMessagePublisher, TorrentPublisher>();
services.AddSingleton<IParseTorrentTitle, ParseTorrentTitle>();
services.RegisterPythonEngine();
services.AddSingleton<IRankTorrentName, RankTorrentName>();
services.AddStackExchangeRedisCache(options =>
{
options.Configuration = redisConfiguration.ConnectionString;

View File

@@ -7,6 +7,8 @@ global using System.Text.RegularExpressions;
global using System.Xml.Linq;
global using FuzzySharp;
global using FuzzySharp.Extractor;
global using FuzzySharp.PreProcess;
global using FuzzySharp.SimilarityRatio.Scorer;
global using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive;
global using LZStringCSharp;
global using MassTransit;
@@ -23,11 +25,10 @@ global using Producer.Features.Crawlers.Torrentio;
global using Producer.Features.CrawlerSupport;
global using Producer.Features.DataProcessing;
global using Producer.Features.JobSupport;
global using PromKnight.ParseTorrentTitle;
global using Serilog;
global using SharedContracts.Configuration;
global using SharedContracts.Dapper;
global using SharedContracts.Extensions;
global using SharedContracts.Models;
global using SharedContracts.Requests;
global using StackExchange.Redis;
global using SharedContracts.Python;
global using SharedContracts.Python.RTN;
global using SharedContracts.Requests;

View File

@@ -19,6 +19,7 @@
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
<PackageReference Include="Polly" Version="8.3.0" />
<PackageReference Include="pythonnet" Version="3.0.3" />
<PackageReference Include="Quartz.Extensions.DependencyInjection" Version="3.8.0" />
<PackageReference Include="Quartz.Extensions.Hosting" Version="3.8.0" />
<PackageReference Include="Serilog" Version="3.1.1" />
@@ -34,9 +35,9 @@
</None>
</ItemGroup>
<ItemGroup>
<Content Remove="Data\**" />
<None Include="Data\**">
<ItemGroup Condition="'$(Configuration)' == 'Debug'">
<Content Remove="python\**" />
<None Include="python\**">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>