mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
[wip] bridge python and c# and bring in rank torrent name (#177)
* [wip] Bridge Python and C# and bring in rank-torrent-name. * Container restores the package now. Includes two dev scripts to install the Python packages locally for debugging purposes. * Introduce slightly tuned title-match scoring by making it length-aware; this should help with sequels such as Terminator 2 vs. Terminator, etc. * Version bump. Also fixes the postgres healthcheck so that it utilises the user from the stack.env file.
This commit is contained in:
2
src/producer/.dockerignore
Normal file
2
src/producer/.dockerignore
Normal file
@@ -0,0 +1,2 @@
|
||||
**/python/
|
||||
.idea/
|
||||
@@ -6,6 +6,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharedContracts", "..\share
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shared", "shared", "{FF5CA857-51E8-4446-8840-2A1D24ED3952}"
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "eng", "eng", "{1AE7F597-24C4-4575-B59F-67A625D95C1E}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
eng\install-python-reqs.ps1 = eng\install-python-reqs.ps1
|
||||
eng\install-python-reqs.sh = eng\install-python-reqs.sh
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
||||
2
src/producer/eng/install-python-reqs.ps1
Normal file
2
src/producer/eng/install-python-reqs.ps1
Normal file
@@ -0,0 +1,2 @@
|
||||
# Installs the pinned rank-torrent-name Python package into ../src/python
# (resolved relative to this script) so it is available locally when debugging.

# Anchor the target on the script's own folder instead of the caller's
# working directory, so the script can be invoked from anywhere.
$targetDir = Join-Path $PSScriptRoot '../src/python'

# -Force makes directory creation idempotent: no error if it already exists.
New-Item -ItemType Directory -Force -Path $targetDir | Out-Null

pip install --force-reinstall rank-torrent-name==0.1.6 -t $targetDir
|
||||
4
src/producer/eng/install-python-reqs.sh
Normal file
4
src/producer/eng/install-python-reqs.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Installs the pinned rank-torrent-name Python package into ../src/python
# (resolved relative to this script) so it is available locally when debugging.

# Stop on the first failing command, on unset variables, and on pipe failures,
# so pip never runs against a directory that could not be created.
set -euo pipefail

# Anchor the target on the script's own folder instead of the caller's
# working directory, so the script can be invoked from anywhere.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
target_dir="${script_dir}/../src/python"

mkdir -p "${target_dir}"
pip install --force-reinstall rank-torrent-name==0.1.6 -t "${target_dir}"
|
||||
2
src/producer/src/.dockerignore
Normal file
2
src/producer/src/.dockerignore
Normal file
@@ -0,0 +1,2 @@
|
||||
**/python/
|
||||
.idea/
|
||||
@@ -8,13 +8,21 @@ WORKDIR /src/producer/src
|
||||
RUN dotnet restore -a $TARGETARCH
|
||||
RUN dotnet publish -c Release --no-restore -o /src/out -a $TARGETARCH
|
||||
|
||||
|
||||
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine
|
||||
FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine3.19
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
RUN apk add --update --no-cache python3=~3.11.8-r0 py3-pip && ln -sf python3 /usr/bin/python
|
||||
|
||||
COPY --from=build /src/out .
|
||||
RUN rm -rf /app/python && mkdir -p /app/python
|
||||
RUN pip3 install --force-reinstall rank-torrent-name==0.1.6 -t /app/python
|
||||
RUN addgroup -S producer && adduser -S -G producer producer
|
||||
USER producer
|
||||
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
||||
CMD pgrep -f dotnet || exit 1
|
||||
|
||||
ENV PYTHONNET_PYDLL=/usr/lib/libpython3.11.so.1.0
|
||||
|
||||
ENTRYPOINT ["dotnet", "Producer.dll"]
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using Microsoft.VisualBasic;
|
||||
|
||||
namespace Producer.Features.Crawlers.Dmm;
|
||||
|
||||
public partial class DebridMediaManagerCrawler(
|
||||
@@ -5,11 +7,12 @@ public partial class DebridMediaManagerCrawler(
|
||||
ILogger<DebridMediaManagerCrawler> logger,
|
||||
IDataStorage storage,
|
||||
GithubConfiguration githubConfiguration,
|
||||
IParseTorrentTitle parseTorrentTitle,
|
||||
IRankTorrentName rankTorrentName,
|
||||
IDistributedCache cache) : BaseCrawler(logger, storage)
|
||||
{
|
||||
[GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
|
||||
private static partial Regex HashCollectionMatcher();
|
||||
private LengthAwareRatioScorer _lengthAwareRatioScorer = new();
|
||||
|
||||
private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";
|
||||
protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
|
||||
@@ -107,74 +110,65 @@ public partial class DebridMediaManagerCrawler(
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var parsedTorrent = parseTorrentTitle.Parse(torrentTitle.CleanTorrentTitleForImdb());
|
||||
|
||||
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.Title);
|
||||
var parsedTorrent = rankTorrentName.Parse(torrentTitle.CleanTorrentTitleForImdb());
|
||||
|
||||
if (cached)
|
||||
{
|
||||
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.Title);
|
||||
return new()
|
||||
{
|
||||
Source = Source,
|
||||
Name = cachedResult.Title,
|
||||
Imdb = cachedResult.ImdbId,
|
||||
Size = bytesElement.GetInt64().ToString(),
|
||||
InfoHash = hashElement.ToString(),
|
||||
Seeders = 0,
|
||||
Leechers = 0,
|
||||
Category = parsedTorrent.TorrentType switch
|
||||
{
|
||||
TorrentType.Movie => "movies",
|
||||
TorrentType.Tv => "tv",
|
||||
_ => "unknown",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
var imdbEntry = await Storage.FindImdbMetadata(parsedTorrent.Title, parsedTorrent.TorrentType, parsedTorrent.Year);
|
||||
|
||||
if (imdbEntry.Count == 0)
|
||||
if (!parsedTorrent.Success)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntry);
|
||||
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.ParsedTitle);
|
||||
|
||||
if (cached)
|
||||
{
|
||||
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.ParsedTitle);
|
||||
return MapToTorrent(cachedResult, bytesElement, hashElement, parsedTorrent);
|
||||
}
|
||||
|
||||
var year = parsedTorrent.Year != 0 ? parsedTorrent.Year.ToString() : null;
|
||||
var imdbEntries = await Storage.FindImdbMetadata(parsedTorrent.ParsedTitle, parsedTorrent.IsMovie ? "movies" : "tv", year);
|
||||
|
||||
if (imdbEntries.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntries);
|
||||
|
||||
if (!scoredTitles.Success)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.Title, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
|
||||
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.ParsedTitle, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
|
||||
|
||||
var torrent = new IngestedTorrent
|
||||
return MapToTorrent(scoredTitles.BestMatch.Value, bytesElement, hashElement, parsedTorrent);
|
||||
}
|
||||
|
||||
private IngestedTorrent MapToTorrent(ImdbEntry result, JsonElement bytesElement, JsonElement hashElement, ParseTorrentTitleResponse parsedTorrent) =>
|
||||
new()
|
||||
{
|
||||
Source = Source,
|
||||
Name = scoredTitles.BestMatch.Value.Title,
|
||||
Imdb = scoredTitles.BestMatch.Value.ImdbId,
|
||||
Name = result.Title,
|
||||
Imdb = result.ImdbId,
|
||||
Size = bytesElement.GetInt64().ToString(),
|
||||
InfoHash = hashElement.ToString(),
|
||||
Seeders = 0,
|
||||
Leechers = 0,
|
||||
Category = parsedTorrent.TorrentType switch
|
||||
Category = parsedTorrent.IsMovie switch
|
||||
{
|
||||
TorrentType.Movie => "movies",
|
||||
TorrentType.Tv => "tv",
|
||||
_ => "unknown",
|
||||
true => "movies",
|
||||
false => "tv",
|
||||
},
|
||||
};
|
||||
|
||||
return torrent;
|
||||
}
|
||||
|
||||
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(TorrentMetadata parsedTorrent, List<ImdbEntry> imdbEntries)
|
||||
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(ParseTorrentTitleResponse parsedTorrent, List<ImdbEntry> imdbEntries)
|
||||
{
|
||||
var lowerCaseTitle = parsedTorrent.Title.ToLowerInvariant();
|
||||
var lowerCaseTitle = parsedTorrent.ParsedTitle.ToLowerInvariant();
|
||||
|
||||
// Scoring directly operates on the List<ImdbEntry>, no need for lookup table.
|
||||
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: new DefaultRatioScorer(), cutoff: 90);
|
||||
var scoredResults = Process.ExtractAll(new(){Title = lowerCaseTitle}, imdbEntries, x => x.Title?.ToLowerInvariant(), scorer: _lengthAwareRatioScorer, cutoff: 90);
|
||||
|
||||
var best = scoredResults.MaxBy(x => x.Score);
|
||||
|
||||
@@ -192,7 +186,7 @@ public partial class DebridMediaManagerCrawler(
|
||||
{
|
||||
var cacheOptions = new DistributedCacheEntryOptions
|
||||
{
|
||||
AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(15),
|
||||
AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1),
|
||||
};
|
||||
|
||||
return cache.SetStringAsync(lowerCaseTitle, JsonSerializer.Serialize(best.Value), cacheOptions);
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
namespace Producer.Features.DataProcessing
{
    /// <summary>
    /// Ratio scorer that scales the score produced by <see cref="DefaultRatioScorer"/>
    /// by the ratio of the shorter input length to the longer input length, so that
    /// inputs of clearly different lengths are penalised.
    /// </summary>
    public class LengthAwareRatioScorer : IRatioScorer
    {
        private readonly IRatioScorer _defaultScorer = new DefaultRatioScorer();

        /// <summary>Scores two strings and applies the length penalty.</summary>
        /// <param name="input1">First string to compare.</param>
        /// <param name="input2">Second string to compare.</param>
        /// <returns>The length-adjusted score, never greater than 100.</returns>
        public int Score(string input1, string input2) =>
            ApplyLengthPenalty(_defaultScorer.Score(input1, input2), input1, input2);

        /// <summary>Scores two strings with the given preprocessing mode and applies the length penalty.</summary>
        /// <param name="input1">First string to compare.</param>
        /// <param name="input2">Second string to compare.</param>
        /// <param name="preprocessMode">Preprocessing mode forwarded to the underlying scorer.</param>
        /// <returns>The length-adjusted score, never greater than 100.</returns>
        public int Score(string input1, string input2, PreprocessMode preprocessMode) =>
            ApplyLengthPenalty(_defaultScorer.Score(input1, input2, preprocessMode), input1, input2);

        /// <summary>
        /// Multiplies <paramref name="score"/> by min(length) / max(length) of the two inputs,
        /// truncates to an integer and caps the result at 100.
        /// </summary>
        private static int ApplyLengthPenalty(int score, string input1, string input2)
        {
            var shortest = Math.Min(input1.Length, input2.Length);
            var longest = Math.Max(input1.Length, input2.Length);

            // Both inputs empty: the lengths are equal, so no penalty applies.
            // Without this guard the ratio is 0/0 = NaN, and converting NaN to int
            // yields an unspecified value.
            if (longest == 0)
            {
                return Math.Min(100, score);
            }

            var lengthRatio = (double)shortest / longest;
            var scaled = (int)(score * lengthRatio);

            return Math.Min(100, scaled);
        }
    }
}
|
||||
@@ -9,7 +9,8 @@ internal static class ServiceCollectionExtensions
|
||||
|
||||
services.AddTransient<IDataStorage, DapperDataStorage>();
|
||||
services.AddTransient<IMessagePublisher, TorrentPublisher>();
|
||||
services.AddSingleton<IParseTorrentTitle, ParseTorrentTitle>();
|
||||
services.RegisterPythonEngine();
|
||||
services.AddSingleton<IRankTorrentName, RankTorrentName>();
|
||||
services.AddStackExchangeRedisCache(options =>
|
||||
{
|
||||
options.Configuration = redisConfiguration.ConnectionString;
|
||||
|
||||
@@ -7,6 +7,8 @@ global using System.Text.RegularExpressions;
|
||||
global using System.Xml.Linq;
|
||||
global using FuzzySharp;
|
||||
global using FuzzySharp.Extractor;
|
||||
global using FuzzySharp.PreProcess;
|
||||
global using FuzzySharp.SimilarityRatio.Scorer;
|
||||
global using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive;
|
||||
global using LZStringCSharp;
|
||||
global using MassTransit;
|
||||
@@ -23,11 +25,10 @@ global using Producer.Features.Crawlers.Torrentio;
|
||||
global using Producer.Features.CrawlerSupport;
|
||||
global using Producer.Features.DataProcessing;
|
||||
global using Producer.Features.JobSupport;
|
||||
global using PromKnight.ParseTorrentTitle;
|
||||
global using Serilog;
|
||||
global using SharedContracts.Configuration;
|
||||
global using SharedContracts.Dapper;
|
||||
global using SharedContracts.Extensions;
|
||||
global using SharedContracts.Models;
|
||||
global using SharedContracts.Requests;
|
||||
global using StackExchange.Redis;
|
||||
global using SharedContracts.Python;
|
||||
global using SharedContracts.Python.RTN;
|
||||
global using SharedContracts.Requests;
|
||||
@@ -19,6 +19,7 @@
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.0" />
|
||||
<PackageReference Include="Polly" Version="8.3.0" />
|
||||
<PackageReference Include="pythonnet" Version="3.0.3" />
|
||||
<PackageReference Include="Quartz.Extensions.DependencyInjection" Version="3.8.0" />
|
||||
<PackageReference Include="Quartz.Extensions.Hosting" Version="3.8.0" />
|
||||
<PackageReference Include="Serilog" Version="3.1.1" />
|
||||
@@ -34,9 +35,9 @@
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Content Remove="Data\**" />
|
||||
<None Include="Data\**">
|
||||
<ItemGroup Condition="'$(Configuration)' == 'Debug'">
|
||||
<Content Remove="python\**" />
|
||||
<None Include="python\**">
|
||||
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
Reference in New Issue
Block a user