From 51eb21f1128cb0f8dc0416056c0d5fa5bfddc3d4 Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Thu, 5 Mar 2020 14:57:57 +0100 Subject: [PATCH] updates horriblesubs scraper to split multiple season entries --- index.js | 2 +- lib/metadata.js | 10 ++--- lib/torrentFiles.js | 2 +- package-lock.json | 2 +- .../horriblesubs/horriblesubs_mapping.json | 5 +-- scrapers/horriblesubs/horriblesubs_scraper.js | 40 +++++++++++++++++-- 6 files changed, 47 insertions(+), 14 deletions(-) diff --git a/index.js b/index.js index 091bf8f..44892bc 100644 --- a/index.js +++ b/index.js @@ -6,7 +6,7 @@ const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_dump_s const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper'); const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper'); -const providers = [thepiratebayScraper]; +const providers = [horribleSubsScraper]; async function scrape() { providers.forEach((provider) => provider.scrape()); diff --git a/lib/metadata.js b/lib/metadata.js index 0531a38..0b8796b 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -67,18 +67,18 @@ function _requestMetadata(url) { }); } -function escapeTitle(title, hyphenEscape = true) { +function escapeTitle(title) { return title.toLowerCase() .normalize('NFKD') // normalize non-ASCII characters .replace(/[\u0300-\u036F]/g, '') .replace(/&/g, 'and') - .replace(hyphenEscape ? /[.,_+ -]+/g : /[.,_+ ]+/g, ' ') // replace dots, commas or underscores with spaces - .replace(/[^\w- ()]/gi, '') // remove all non-alphanumeric chars + .replace(/[;, ~\-]+/g, ' ') // replace dots, commas or underscores with spaces + .replace(/[^\w ()+#@!']+/g, '') // remove all non-alphanumeric chars .trim(); } async function getImdbId(info, type) { - const name = escapeTitle(info.title).toLowerCase(); + const name = escapeTitle(info.title); const year = info.year || info.date && info.date.slice(0, 4); const key = `${name}_${year}_${type}`; @@ -100,7 +100,7 @@ async function getImdbId(info, type) { } async function getKitsuId(info) { - const title = escapeTitle(info.title).toLowerCase().replace(/[;]+/g, ' ').replace(/[,%']+/g, ''); + const title = escapeTitle(info.title); const season = info.season > 1 ? ` S${info.season}` : ''; const query = `${title}${season}`; diff --git a/lib/torrentFiles.js b/lib/torrentFiles.js index cc04d09..08ba49f 100644 --- a/lib/torrentFiles.js +++ b/lib/torrentFiles.js @@ -4,7 +4,7 @@ const { torrentFiles } = require('../lib/torrent'); const { escapeTitle, getMetadata, getImdbId } = require('../lib/metadata'); const { Type } = require('./types'); -const MIN_SIZE = 10 * 1024 * 1024; // 20 MB +const MIN_SIZE = 10 * 1024 * 1024; // 10 MB async function parseTorrentFiles(torrent) { const parsedTorrentName = parse(torrent.title); diff --git a/package-lock.json b/package-lock.json index 95d821e..1f7a7c5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1714,7 +1714,7 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#6767960f4c40c7c97fd2f8e191cf099ab7dcd1dd", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#ddd5037820289d35e600baec9d8e730935d261af", "from": "git://github.com/TheBeastLT/parse-torrent-title.git#master" }, "parseurl": { diff --git a/scrapers/horriblesubs/horriblesubs_mapping.json b/scrapers/horriblesubs/horriblesubs_mapping.json index c6fb611..af116b1 100644 --- a/scrapers/horriblesubs/horriblesubs_mapping.json +++ b/scrapers/horriblesubs/horriblesubs_mapping.json @@ -68,8 +68,7 @@ "kitsu_id": "6508" }, "Anohana (Live Action)": { - "showId": "18", - "kitsu_id": "5981" + "showId": "18" }, "Another": { "showId": "19", @@ -153,7 +152,7 @@ }, "Binan Koukou Chikyuu Bouei-bu Love!": { "showId": "38", - "kitsu_id": "11329" + "kitsu_id": "9173" }, "Black Bullet": { "showId": "39", diff --git a/scrapers/horriblesubs/horriblesubs_scraper.js b/scrapers/horriblesubs/horriblesubs_scraper.js index d275389..a67f220 100644 --- a/scrapers/horriblesubs/horriblesubs_scraper.js +++ b/scrapers/horriblesubs/horriblesubs_scraper.js @@ -34,6 +34,18 @@ async function _scrapeAllShows() { .catch((err) => console.log(err))))); } +async function compareSearchKitsuIds() { + console.log(`${NAME}: initiating kitsu compare...`); + const shows = await horriblesubs.allShows() + .then((shows) => Promise.all(shows.slice(0, 1).map((show) => limiter.schedule(() => enrichShow(show))))); + + const incorrect = shows.filter( + (show) => showMappings[show.title] && showMappings[show.title].kitsu_id !== show.kitsu_id); + const incorrectRatio = incorrect.length / shows.length; + console.log(incorrect); + console.log(`Ratio: ${incorrectRatio}`); +} + async function initMapping() { console.log(`${NAME}: initiating kitsu mapping...`); const shows = await horriblesubs.allShows() @@ -81,6 +93,28 @@ async function _parseShowData(showData) { throw new Error(`No kitsuId found for ${showData.title}`); } + // sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry + const kitsuIdsMapping = kitsuId.length && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId))) + .then((metas) => metas.reduce((map, meta) => { + const epOffset = Object.keys(map).length; + [...Array(meta.totalCount).keys()] + .map(ep => ep + 1) + .forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title }); + return map; + }, {})) || {}; + const formatTitle = (episodeInfo, mirror) => { + const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')]; + if (mapping) { + return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`; + } + return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`; + }; + const getKitsuId = inputEpisode => { + const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode; + const episode = parseInt(episodeString, 10); + return kitsuIdsMapping[episode] && kitsuIdsMapping[episode].kitsuId || kitsuId; + }; + return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes) .map((episodeInfo) => episodeInfo.mirrors .map((mirror) => ({ @@ -88,10 +122,10 @@ async function _parseShowData(showData) { ...mirror, infoHash: decode(mirror.magnetLink).infoHash, trackers: decode(mirror.magnetLink).tr.join(','), - title: `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`, + title: formatTitle(episodeInfo, mirror), size: 300000000, type: Type.ANIME, - kitsuId: kitsuId, + kitsuId: getKitsuId(episodeInfo.episode), uploadDate: episodeInfo.uploadDate, }))) .reduce((a, b) => a.concat(b), []) @@ -120,7 +154,7 @@ async function verifyFiles(torrent, files) { } return files; } - throw new Error(`No video files found for: ${torrent.title}`); + return Promise.reject(`No video files found for: ${torrent.title}`); } async function checkIfExists(torrent) {