Improve trigram search performance and add housekeeping (#184)

* add new indexes, and change year column to int

* Change gist to gin, and change year to int

* Producer changes for new gin query

* Fully map the rtn response using json dump from Pydantic

Also updates Rtn to 0.1.9

* Add housekeeping script to reconcile imdb ids.

* Join Torrent onto the ingested torrent table

Ensure that a torrent can always find the details of where it came from, and how it was parsed.

* Version bump for release

* Fix missing quote on table name
This commit is contained in:
iPromKnight
2024-03-29 19:01:48 +00:00
committed by GitHub
parent 2d78dc2735
commit 66609c2a46
23 changed files with 303 additions and 102 deletions

View File

@@ -1,5 +1,3 @@
using Microsoft.VisualBasic;
namespace Producer.Features.Crawlers.Dmm;
public partial class DebridMediaManagerCrawler(
@@ -12,7 +10,6 @@ public partial class DebridMediaManagerCrawler(
{
[GeneratedRegex("""<iframe src="https:\/\/debridmediamanager.com\/hashlist#(.*)"></iframe>""")]
private static partial Regex HashCollectionMatcher();
private LengthAwareRatioScorer _lengthAwareRatioScorer = new();
private const string DownloadBaseUrl = "https://raw.githubusercontent.com/debridmediamanager/hashlists/main";
protected override IReadOnlyDictionary<string, string> Mappings => new Dictionary<string, string>();
@@ -118,32 +115,27 @@ public partial class DebridMediaManagerCrawler(
return null;
}
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.ParsedTitle);
var (cached, cachedResult) = await CheckIfInCacheAndReturn(parsedTorrent.Response.ParsedTitle);
if (cached)
{
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.ParsedTitle);
logger.LogInformation("[{ImdbId}] Found cached imdb result for {Title}", cachedResult.ImdbId, parsedTorrent.Response.ParsedTitle);
return MapToTorrent(cachedResult, bytesElement, hashElement, parsedTorrent);
}
var year = parsedTorrent.Year != 0 ? parsedTorrent.Year.ToString() : null;
var imdbEntries = await Storage.FindImdbMetadata(parsedTorrent.ParsedTitle, parsedTorrent.IsMovie ? "movies" : "tv", year);
int? year = parsedTorrent.Response.Year != 0 ? parsedTorrent.Response.Year : null;
var imdbEntry = await Storage.FindImdbMetadata(parsedTorrent.Response.ParsedTitle, parsedTorrent.Response.IsMovie ? "movies" : "tv", year);
if (imdbEntries.Count == 0)
if (imdbEntry is null)
{
return null;
}
var scoredTitles = await ScoreTitles(parsedTorrent, imdbEntries);
await AddToCache(parsedTorrent.Response.ParsedTitle.ToLowerInvariant(), imdbEntry);
if (!scoredTitles.Success)
{
return null;
}
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", scoredTitles.BestMatch.Value.ImdbId, parsedTorrent.ParsedTitle, scoredTitles.BestMatch.Value.Title, scoredTitles.BestMatch.Score);
logger.LogInformation("[{ImdbId}] Found best match for {Title}: {BestMatch} with score {Score}", imdbEntry.ImdbId, parsedTorrent.Response.ParsedTitle, imdbEntry.Title, imdbEntry.Score);
return MapToTorrent(scoredTitles.BestMatch.Value, bytesElement, hashElement, parsedTorrent);
return MapToTorrent(imdbEntry, bytesElement, hashElement, parsedTorrent);
}
private IngestedTorrent MapToTorrent(ImdbEntry result, JsonElement bytesElement, JsonElement hashElement, ParseTorrentTitleResponse parsedTorrent) =>
@@ -156,40 +148,22 @@ public partial class DebridMediaManagerCrawler(
InfoHash = hashElement.ToString(),
Seeders = 0,
Leechers = 0,
Category = parsedTorrent.IsMovie switch
Category = parsedTorrent.Response.IsMovie switch
{
true => "movies",
false => "tv",
},
RtnResponse = parsedTorrent.Response.ToJson(),
};
/// <summary>
/// Scores the candidate IMDb entries against the parsed torrent title and caches the top-scoring one.
/// </summary>
/// <param name="parsedTorrent">Parse result whose <c>ParsedTitle</c> is lower-cased and used as the query title.</param>
/// <param name="imdbEntries">Candidate entries; each is compared through its lower-cased <c>Title</c>.</param>
/// <returns>
/// <c>(true, match)</c> with the highest-scoring result returned by <c>Process.ExtractAll</c> (called with <c>cutoff: 90</c>),
/// or <c>(false, null)</c> when no result is available.
/// </returns>
private async Task<(bool Success, ExtractedResult<ImdbEntry>? BestMatch)> ScoreTitles(ParseTorrentTitleResponse parsedTorrent, List<ImdbEntry> imdbEntries)
{
    var normalizedTitle = parsedTorrent.ParsedTitle.ToLowerInvariant();

    // The candidate list is scored directly; no separate lookup table is built.
    var topCandidate = Process
        .ExtractAll(new(){Title = normalizedTitle}, imdbEntries, entry => entry.Title?.ToLowerInvariant(), scorer: _lengthAwareRatioScorer, cutoff: 90)
        .MaxBy(candidate => candidate.Score);

    if (topCandidate is not null)
    {
        // Remember the winner under the lower-cased title before handing it back.
        await AddToCache(normalizedTitle, topCandidate);
        return (true, topCandidate);
    }

    return (false, null);
}
private Task AddToCache(string lowerCaseTitle, ExtractedResult<ImdbEntry> best)
private Task AddToCache(string lowerCaseTitle, ImdbEntry best)
{
var cacheOptions = new DistributedCacheEntryOptions
{
AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1),
};
return cache.SetStringAsync(lowerCaseTitle, JsonSerializer.Serialize(best.Value), cacheOptions);
return cache.SetStringAsync(lowerCaseTitle, JsonSerializer.Serialize(best), cacheOptions);
}
private async Task<(bool Success, ImdbEntry? Entry)> CheckIfInCacheAndReturn(string title)

View File

@@ -1 +1 @@
rank-torrent-name==0.1.8
rank-torrent-name==0.1.9