trigram performance increased and housekeeping (#184)

* add new indexes, and change year column to int

* Change gist to gin, and change year to int

* Producer changes for new gin query

* Fully map the rtn response using json dump from Pydantic

Also updates Rtn to 0.1.9

* Add housekeeping script to reconcile imdb ids.

* Join Torrent onto the ingested torrent table

Ensure that a torrent can always find the details of where it came from, and how it was parsed.

* Version bump for release

* missing quote on table name
This commit is contained in:
iPromKnight
2024-03-29 19:01:48 +00:00
committed by GitHub
parent 2d78dc2735
commit 66609c2a46
23 changed files with 303 additions and 102 deletions

View File

@@ -9,9 +9,9 @@ public class DapperDataStorage(PostgresConfiguration configuration, RabbitMqConf
const string query =
"""
INSERT INTO ingested_torrents
("name", "source", "category", "info_hash", "size", "seeders", "leechers", "imdb", "processed", "createdAt", "updatedAt")
("name", "source", "category", "info_hash", "size", "seeders", "leechers", "imdb", "processed", "createdAt", "updatedAt", "rtn_response")
VALUES
(@Name, @Source, @Category, @InfoHash, @Size, @Seeders, @Leechers, @Imdb, @Processed, @CreatedAt, @UpdatedAt)
(@Name, @Source, @Category, @InfoHash, @Size, @Seeders, @Leechers, @Imdb, @Processed, @CreatedAt, @UpdatedAt, @RtnResponse::jsonb)
ON CONFLICT (source, info_hash) DO NOTHING
""";
@@ -110,21 +110,21 @@ public class DapperDataStorage(PostgresConfiguration configuration, RabbitMqConf
public async Task<List<ImdbEntry>> GetImdbEntriesForRequests(int year, int batchSize, string? stateLastProcessedImdbId, CancellationToken cancellationToken = default) =>
await ExecuteCommandAsync(async connection =>
{
const string query = @"SELECT imdb_id AS ImdbId, title as Title, category as Category, year as Year, adult as Adult FROM imdb_metadata WHERE CAST(NULLIF(Year, '\N') AS INTEGER) <= @Year AND imdb_id > @LastProcessedImdbId ORDER BY ImdbId LIMIT @BatchSize";
const string query = @"SELECT imdb_id AS ImdbId, title as Title, category as Category, year as Year, adult as Adult FROM imdb_metadata WHERE Year <= @Year AND imdb_id > @LastProcessedImdbId ORDER BY ImdbId LIMIT @BatchSize";
var result = await connection.QueryAsync<ImdbEntry>(query, new { Year = year, LastProcessedImdbId = stateLastProcessedImdbId, BatchSize = batchSize });
return result.ToList();
}, "Error getting imdb metadata.", cancellationToken);
public async Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, string torrentType, string? year, CancellationToken cancellationToken = default) =>
public async Task<ImdbEntry?> FindImdbMetadata(string? parsedTorrentTitle, string torrentType, int? year, CancellationToken cancellationToken = default) =>
await ExecuteCommandAsync(async connection =>
{
var query = $"select \"imdb_id\" as \"ImdbId\", \"title\" as \"Title\", \"year\" as \"Year\" from search_imdb_meta('{parsedTorrentTitle.Replace("'", "").Replace("\"", "")}', '{(torrentType.Equals("movie", StringComparison.OrdinalIgnoreCase) ? "movie" : "tvSeries")}'";
query += year is not null ? $", '{year}'" : ", NULL";
query += ", 15)";
var query = $"select \"imdb_id\" as \"ImdbId\", \"title\" as \"Title\", \"year\" as \"Year\", \"score\" as Score from search_imdb_meta('{parsedTorrentTitle.Replace("'", "").Replace("\"", "")}', '{(torrentType.Equals("movie", StringComparison.OrdinalIgnoreCase) ? "movie" : "tvSeries")}'";
query += year is not null ? $", {year}" : ", NULL";
query += ", 1)";
var result = await connection.QueryAsync<ImdbEntry>(query);
return result.ToList();
var results = result.ToList();
return results.FirstOrDefault();
}, "Error finding imdb metadata.", cancellationToken);
public Task InsertTorrent(Torrent torrent, CancellationToken cancellationToken = default) =>
@@ -134,9 +134,9 @@ public class DapperDataStorage(PostgresConfiguration configuration, RabbitMqConf
const string query =
"""
INSERT INTO "torrents"
("infoHash", "provider", "torrentId", "title", "size", "type", "uploadDate", "seeders", "trackers", "languages", "resolution", "reviewed", "opened", "createdAt", "updatedAt")
("infoHash", "ingestedTorrentId", "provider", "title", "size", "type", "uploadDate", "seeders", "languages", "resolution", "reviewed", "opened", "createdAt", "updatedAt")
VALUES
(@InfoHash, @Provider, @TorrentId, @Title, 0, @Type, NOW(), @Seeders, NULL, NULL, NULL, false, false, NOW(), NOW())
(@InfoHash, @IngestedTorrentId, @Provider, @Title, 0, @Type, NOW(), @Seeders, NULL, NULL, false, false, NOW(), NOW())
ON CONFLICT ("infoHash") DO NOTHING
""";

View File

@@ -9,7 +9,7 @@ public interface IDataStorage
Task<DapperResult<PageIngestedResult, PageIngestedResult>> MarkPageAsIngested(string pageId, CancellationToken cancellationToken = default);
Task<DapperResult<int, int>> GetRowCountImdbMetadata(CancellationToken cancellationToken = default);
Task<List<ImdbEntry>> GetImdbEntriesForRequests(int year, int batchSize, string? stateLastProcessedImdbId, CancellationToken cancellationToken = default);
Task<List<ImdbEntry>> FindImdbMetadata(string? parsedTorrentTitle, string parsedTorrentTorrentType, string? parsedTorrentYear, CancellationToken cancellationToken = default);
Task<ImdbEntry?> FindImdbMetadata(string? parsedTorrentTitle, string parsedTorrentTorrentType, int? parsedTorrentYear, CancellationToken cancellationToken = default);
Task InsertTorrent(Torrent torrent, CancellationToken cancellationToken = default);
Task InsertFiles(IEnumerable<TorrentFile> files, CancellationToken cancellationToken = default);
Task InsertSubtitles(IEnumerable<SubtitleFile> subtitles, CancellationToken cancellationToken = default);

View File

@@ -0,0 +1,14 @@
namespace SharedContracts.Extensions;
public static class JsonExtensions
{
private static readonly JsonSerializerOptions JsonSerializerOptions = new()
{
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
WriteIndented = false,
ReferenceHandler = ReferenceHandler.IgnoreCycles,
NumberHandling = JsonNumberHandling.Strict,
};
public static string AsJson<T>(this T obj) => JsonSerializer.Serialize(obj, JsonSerializerOptions);
}

View File

@@ -1,9 +1,7 @@
// Global using directives
global using System.Collections.Concurrent;
global using System.Globalization;
global using System.Text;
global using System.Text.Json;
global using System.Text.Json.Serialization;
global using Dapper;
global using MassTransit;
global using Microsoft.AspNetCore.Builder;
@@ -17,4 +15,4 @@ global using Python.Runtime;
global using Serilog;
global using SharedContracts.Configuration;
global using SharedContracts.Extensions;
global using SharedContracts.Models;
global using SharedContracts.Models;

View File

@@ -7,4 +7,5 @@ public class ImdbEntry
public string? Category { get; set; }
public string? Year { get; set; }
public bool? Adult { get; set; }
public decimal? Score { get; set; }
}

View File

@@ -12,7 +12,9 @@ public class IngestedTorrent
public int Leechers { get; set; }
public string? Imdb { get; set; }
public bool Processed { get; set; } = false;
public bool Processed { get; set; }
public DateTime CreatedAt { get; set; } = DateTime.UtcNow;
public DateTime UpdatedAt { get; set; } = DateTime.UtcNow;
public string? RtnResponse { get; set; }
}

View File

@@ -3,6 +3,7 @@ namespace SharedContracts.Models;
public class Torrent
{
public string? InfoHash { get; set; }
public long? IngestedTorrentId { get; set; }
public string? Provider { get; set; }
public string? TorrentId { get; set; }
public string? Title { get; set; }

View File

@@ -7,7 +7,7 @@ public interface IPythonEngineService
Task InitializePythonEngine(CancellationToken cancellationToken);
T ExecuteCommandOrScript<T>(string command, PyModule module, bool throwOnErrors);
T ExecutePythonOperation<T>(Func<T> operation, string operationName, bool throwOnErrors);
T ExecutePythonOperationWithDefault<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors);
T ExecutePythonOperationWithDefault<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors, bool logErrors);
Task StopPythonEngine(CancellationToken cancellationToken);
dynamic? Sys { get; }
}

View File

@@ -53,10 +53,10 @@ public class PythonEngineService(ILogger<PythonEngineService> logger) : IPythonE
}, nameof(ExecuteCommandOrScript), throwOnErrors);
public T ExecutePythonOperation<T>(Func<T> operation, string operationName, bool throwOnErrors) =>
ExecutePythonOperationWithDefault(operation, default, operationName, throwOnErrors);
ExecutePythonOperationWithDefault(operation, default, operationName, throwOnErrors, true);
public T ExecutePythonOperationWithDefault<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors) =>
ExecutePythonOperationInternal(operation, defaultValue, operationName, throwOnErrors);
public T ExecutePythonOperationWithDefault<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors, bool logErrors) =>
ExecutePythonOperationInternal(operation, defaultValue, operationName, throwOnErrors, logErrors);
public void ExecuteOnGIL(Action act, bool throwOnErrors)
{
@@ -95,7 +95,7 @@ public class PythonEngineService(ILogger<PythonEngineService> logger) : IPythonE
}
// ReSharper disable once EntityNameCapturedOnly.Local
private T ExecutePythonOperationInternal<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors)
private T ExecutePythonOperationInternal<T>(Func<T> operation, T? defaultValue, string operationName, bool throwOnErrors, bool logErrors)
{
Sys ??= LoadSys();
@@ -108,7 +108,10 @@ public class PythonEngineService(ILogger<PythonEngineService> logger) : IPythonE
}
catch (Exception ex)
{
Logger.LogError(ex, "Python Error: {Message} ({OperationName})", ex.Message, nameof(operationName));
if (logErrors)
{
Logger.LogError(ex, "Python Error: {Message} ({OperationName})", ex.Message, nameof(operationName));
}
if (throwOnErrors)
{

View File

@@ -1,6 +1,3 @@
namespace SharedContracts.Python.RTN;
public record ParseTorrentTitleResponse(bool Success, string ParsedTitle, int Year, int[]? Season = null, int[]? Episode = null)
{
public bool IsMovie => Season == null && Episode == null;
}
public record ParseTorrentTitleResponse(bool Success, RtnResponse? Response);

View File

@@ -14,34 +14,31 @@ public class RankTorrentName : IRankTorrentName
}
public ParseTorrentTitleResponse Parse(string title) =>
_pythonEngineService.ExecutePythonOperation(
_pythonEngineService.ExecutePythonOperationWithDefault(
() =>
{
var result = _rtn?.parse(title);
return ParseResult(result);
}, nameof(Parse), throwOnErrors: false);
}, new ParseTorrentTitleResponse(false, null), nameof(Parse), throwOnErrors: false, logErrors: false);
private static ParseTorrentTitleResponse ParseResult(dynamic result)
{
if (result == null)
{
return new(false, string.Empty, 0);
return new(false, null);
}
var json = result.model_dump_json()?.As<string?>();
if (json is null || string.IsNullOrEmpty(json))
{
return new(false, null);
}
var parsedTitle = result.GetAttr("parsed_title")?.As<string>() ?? string.Empty;
var year = result.GetAttr("year")?.As<int>() ?? 0;
var seasons = GetIntArray(result, "season");
var episodes = GetIntArray(result, "episode");
var response = JsonSerializer.Deserialize<RtnResponse>(json);
return new ParseTorrentTitleResponse(true, parsedTitle, year, seasons, episodes);
}
private static int[]? GetIntArray(dynamic result, string field)
{
var theList = result.GetAttr(field)?.As<PyList>();
int[]? results = theList?.Length() > 0 ? theList.As<int[]>() : null;
return results;
return new(true, response);
}
private void InitModules() =>

View File

@@ -0,0 +1,83 @@
namespace SharedContracts.Python.RTN;
public class RtnResponse
{
[JsonPropertyName("raw_title")]
public string? RawTitle { get; set; }
[JsonPropertyName("parsed_title")]
public string? ParsedTitle { get; set; }
[JsonPropertyName("fetch")]
public bool Fetch { get; set; }
[JsonPropertyName("is_4k")]
public bool Is4K { get; set; }
[JsonPropertyName("is_multi_audio")]
public bool IsMultiAudio { get; set; }
[JsonPropertyName("is_multi_subtitle")]
public bool IsMultiSubtitle { get; set; }
[JsonPropertyName("is_complete")]
public bool IsComplete { get; set; }
[JsonPropertyName("year")]
public int Year { get; set; }
[JsonPropertyName("resolution")]
public List<string>? Resolution { get; set; }
[JsonPropertyName("quality")]
public List<string>? Quality { get; set; }
[JsonPropertyName("season")]
public List<int>? Season { get; set; }
[JsonPropertyName("episode")]
public List<int>? Episode { get; set; }
[JsonPropertyName("codec")]
public List<string>? Codec { get; set; }
[JsonPropertyName("audio")]
public List<string>? Audio { get; set; }
[JsonPropertyName("subtitles")]
public List<string>? Subtitles { get; set; }
[JsonPropertyName("language")]
public List<string>? Language { get; set; }
[JsonPropertyName("bit_depth")]
public List<int>? BitDepth { get; set; }
[JsonPropertyName("hdr")]
public string? Hdr { get; set; }
[JsonPropertyName("proper")]
public bool Proper { get; set; }
[JsonPropertyName("repack")]
public bool Repack { get; set; }
[JsonPropertyName("remux")]
public bool Remux { get; set; }
[JsonPropertyName("upscaled")]
public bool Upscaled { get; set; }
[JsonPropertyName("remastered")]
public bool Remastered { get; set; }
[JsonPropertyName("directors_cut")]
public bool DirectorsCut { get; set; }
[JsonPropertyName("extended")]
public bool Extended { get; set; }
public bool IsMovie => Season == null && Episode == null;
public string ToJson() => this.AsJson();
}