trigram performance increased and housekeeping (#184)

* add new indexes, and change year column to int

* Change gist to gin, and change year to int

* Producer changes for new gin query

* Fully map the rtn response using json dump from Pydantic

Also updates Rtn to 0.1.9

* Add housekeeping script to reconcile imdb ids.

* Join Torrent onto the ingested torrent table

Ensure that a torrent can always find the details of where it came from, and how it was parsed.

* Version bump for release

* missing quote on table name
This commit is contained in:
iPromKnight
2024-03-29 19:01:48 +00:00
committed by GitHub
parent 2d78dc2735
commit 66609c2a46
23 changed files with 303 additions and 102 deletions

View File

@@ -0,0 +1,35 @@
-- Purpose: Change the year column to integer and add a search function that allows for searching by year.
ALTER TABLE imdb_metadata
ALTER COLUMN year TYPE integer USING (CASE WHEN year = '\N' THEN 0 ELSE year::integer END);
-- Remove the old search function
DROP FUNCTION IF EXISTS search_imdb_meta(TEXT, TEXT, TEXT, INT);
-- Add the new search function that allows for searching by year with a plus/minus one year range
CREATE OR REPLACE FUNCTION search_imdb_meta(search_term TEXT, category_param TEXT DEFAULT NULL, year_param INT DEFAULT NULL, limit_param INT DEFAULT 10)
RETURNS TABLE(imdb_id character varying(16), title character varying(1000),category character varying(50),year INT, score REAL) AS $$
BEGIN
SET pg_trgm.similarity_threshold = 0.9;
RETURN QUERY
SELECT imdb_metadata.imdb_id, imdb_metadata.title, imdb_metadata.category, imdb_metadata.year, similarity(imdb_metadata.title, search_term) as score
FROM imdb_metadata
WHERE (imdb_metadata.title % search_term)
AND (imdb_metadata.adult = FALSE)
AND (category_param IS NULL OR imdb_metadata.category = category_param)
AND (year_param IS NULL OR imdb_metadata.year BETWEEN year_param - 1 AND year_param + 1)
ORDER BY score DESC
LIMIT limit_param;
END; $$
LANGUAGE plpgsql;
-- Drop the old indexes
DROP INDEX IF EXISTS idx_imdb_metadata_adult;
DROP INDEX IF EXISTS idx_imdb_metadata_category;
DROP INDEX IF EXISTS idx_imdb_metadata_year;
DROP INDEX IF EXISTS title_gist;
-- Add indexes for the new columns
CREATE INDEX idx_imdb_metadata_adult ON imdb_metadata(adult);
CREATE INDEX idx_imdb_metadata_category ON imdb_metadata(category);
CREATE INDEX idx_imdb_metadata_year ON imdb_metadata(year);
CREATE INDEX title_gin ON imdb_metadata USING gin(title gin_trgm_ops);

View File

@@ -0,0 +1,40 @@
-- Purpose: Add the jsonb column to the ingested_torrents table to store the response from RTN
ALTER TABLE ingested_torrents
ADD COLUMN IF NOT EXISTS rtn_response jsonb;
-- Purpose: Drop torrentId column from torrents table
ALTER TABLE torrents
DROP COLUMN IF EXISTS "torrentId";
-- Purpose: Drop Trackers column from torrents table
ALTER TABLE torrents
DROP COLUMN IF EXISTS "trackers";
-- Purpose: Create a foreign key relationsship if it does not already exist between torrents and the source table ingested_torrents, but do not cascade on delete.
ALTER TABLE torrents
ADD COLUMN IF NOT EXISTS "ingestedTorrentId" bigint;
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM information_schema.table_constraints
WHERE constraint_name = 'fk_torrents_info_hash'
)
THEN
ALTER TABLE torrents
DROP CONSTRAINT fk_torrents_info_hash;
END IF;
END $$;
ALTER TABLE torrents
ADD CONSTRAINT fk_torrents_info_hash
FOREIGN KEY ("ingestedTorrentId")
REFERENCES ingested_torrents("id")
ON DELETE NO ACTION;
UPDATE torrents
SET "ingestedTorrentId" = ingested_torrents."id"
FROM ingested_torrents
WHERE torrents."infoHash" = ingested_torrents."info_hash"
AND torrents."provider" = ingested_torrents."source";

View File

@@ -0,0 +1,55 @@
DROP FUNCTION IF EXISTS kc_maintenance_reconcile_dmm_imdb_ids();
CREATE OR REPLACE FUNCTION kc_maintenance_reconcile_dmm_imdb_ids()
RETURNS INTEGER AS $$
DECLARE
rec RECORD;
imdb_rec RECORD;
rows_affected INTEGER := 0;
BEGIN
RAISE NOTICE 'Starting Reconciliation of DMM IMDB Ids...';
FOR rec IN
SELECT
it."id" as "ingestion_id",
t."infoHash",
it."category" as "ingestion_category",
f."id" as "file_Id",
f."title" as "file_Title",
(rtn_response->>'raw_title')::text as "raw_title",
(rtn_response->>'parsed_title')::text as "parsed_title",
(rtn_response->>'year')::int as "year"
FROM torrents t
JOIN ingested_torrents it ON t."ingestedTorrentId" = it."id"
JOIN files f ON t."infoHash" = f."infoHash"
WHERE t."provider" = 'DMM'
LOOP
RAISE NOTICE 'Processing record with file_Id: %', rec."file_Id";
FOR imdb_rec IN
SELECT * FROM search_imdb_meta(
rec."parsed_title",
CASE
WHEN rec."ingestion_category" = 'tv' THEN 'tvSeries'
WHEN rec."ingestion_category" = 'movies' THEN 'movie'
END,
CASE
WHEN rec."year" = 0 THEN NULL
ELSE rec."year" END,
1)
LOOP
IF imdb_rec IS NOT NULL THEN
RAISE NOTICE 'Updating file_Id: % with imdbId: %, parsed title: %, imdb title: %', rec."file_Id", imdb_rec."imdb_id", rec."parsed_title", imdb_rec."title";
UPDATE "files"
SET "imdbId" = imdb_rec."imdb_id"
WHERE "id" = rec."file_Id";
rows_affected := rows_affected + 1;
ELSE
RAISE NOTICE 'No IMDB ID found for file_Id: %, parsed title: %, imdb title: %, setting imdbId to NULL', rec."file_Id", rec."parsed_title", imdb_rec."title";
UPDATE "files"
SET "imdbId" = NULL
WHERE "id" = rec."file_Id";
END IF;
END LOOP;
END LOOP;
RAISE NOTICE 'Finished reconciliation. Total rows affected: %', rows_affected;
RETURN rows_affected;
END;
$$ LANGUAGE plpgsql;