updates horriblesubs scraper to split multiple season entries

This commit is contained in:
TheBeastLT
2020-03-05 14:57:57 +01:00
parent cbfcea65f8
commit 51eb21f112
6 changed files with 47 additions and 14 deletions

View File

@@ -6,7 +6,7 @@ const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_dump_s
const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper'); const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper'); const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
const providers = [thepiratebayScraper]; const providers = [horribleSubsScraper];
async function scrape() { async function scrape() {
providers.forEach((provider) => provider.scrape()); providers.forEach((provider) => provider.scrape());

View File

@@ -67,18 +67,18 @@ function _requestMetadata(url) {
}); });
} }
function escapeTitle(title, hyphenEscape = true) { function escapeTitle(title) {
return title.toLowerCase() return title.toLowerCase()
.normalize('NFKD') // normalize non-ASCII characters .normalize('NFKD') // normalize non-ASCII characters
.replace(/[\u0300-\u036F]/g, '') .replace(/[\u0300-\u036F]/g, '')
.replace(/&/g, 'and') .replace(/&/g, 'and')
.replace(hyphenEscape ? /[.,_+ -]+/g : /[.,_+ ]+/g, ' ') // replace dots, commas or underscores with spaces .replace(/[;, ~\-]+/g, ' ') // replace dots, commas or underscores with spaces
.replace(/[^\w- ()]/gi, '') // remove all non-alphanumeric chars .replace(/[^\w ()+#@!']+/g, '') // remove all non-alphanumeric chars
.trim(); .trim();
} }
async function getImdbId(info, type) { async function getImdbId(info, type) {
const name = escapeTitle(info.title).toLowerCase(); const name = escapeTitle(info.title);
const year = info.year || info.date && info.date.slice(0, 4); const year = info.year || info.date && info.date.slice(0, 4);
const key = `${name}_${year}_${type}`; const key = `${name}_${year}_${type}`;
@@ -100,7 +100,7 @@ async function getImdbId(info, type) {
} }
async function getKitsuId(info) { async function getKitsuId(info) {
const title = escapeTitle(info.title).toLowerCase().replace(/[;]+/g, ' ').replace(/[,%']+/g, ''); const title = escapeTitle(info.title);
const season = info.season > 1 ? ` S${info.season}` : ''; const season = info.season > 1 ? ` S${info.season}` : '';
const query = `${title}${season}`; const query = `${title}${season}`;

View File

@@ -4,7 +4,7 @@ const { torrentFiles } = require('../lib/torrent');
const { escapeTitle, getMetadata, getImdbId } = require('../lib/metadata'); const { escapeTitle, getMetadata, getImdbId } = require('../lib/metadata');
const { Type } = require('./types'); const { Type } = require('./types');
const MIN_SIZE = 10 * 1024 * 1024; // 20 MB const MIN_SIZE = 10 * 1024 * 1024; // 10 MB
async function parseTorrentFiles(torrent) { async function parseTorrentFiles(torrent) {
const parsedTorrentName = parse(torrent.title); const parsedTorrentName = parse(torrent.title);

2
package-lock.json generated
View File

@@ -1714,7 +1714,7 @@
} }
}, },
"parse-torrent-title": { "parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#6767960f4c40c7c97fd2f8e191cf099ab7dcd1dd", "version": "git://github.com/TheBeastLT/parse-torrent-title.git#ddd5037820289d35e600baec9d8e730935d261af",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#master" "from": "git://github.com/TheBeastLT/parse-torrent-title.git#master"
}, },
"parseurl": { "parseurl": {

View File

@@ -68,8 +68,7 @@
"kitsu_id": "6508" "kitsu_id": "6508"
}, },
"Anohana (Live Action)": { "Anohana (Live Action)": {
"showId": "18", "showId": "18"
"kitsu_id": "5981"
}, },
"Another": { "Another": {
"showId": "19", "showId": "19",
@@ -153,7 +152,7 @@
}, },
"Binan Koukou Chikyuu Bouei-bu Love!": { "Binan Koukou Chikyuu Bouei-bu Love!": {
"showId": "38", "showId": "38",
"kitsu_id": "11329" "kitsu_id": "9173"
}, },
"Black Bullet": { "Black Bullet": {
"showId": "39", "showId": "39",

View File

@@ -34,6 +34,18 @@ async function _scrapeAllShows() {
.catch((err) => console.log(err))))); .catch((err) => console.log(err)))));
} }
/**
 * Compares freshly enriched shows against the stored show mappings and logs
 * the ones whose `kitsu_id` differs.
 *
 * Note that only the first show returned by `horriblesubs.allShows()` is
 * enriched and compared (`slice(0, 1)`). Shows without an entry in
 * `showMappings` are never counted as incorrect. The mismatching shows and
 * the mismatch ratio are written to the console; nothing is returned.
 *
 * NOTE(review): `enrichShow` is defined elsewhere; presumably it resolves the
 * `kitsu_id` for a show - confirm against its implementation.
 */
async function compareSearchKitsuIds() {
  console.log(`${NAME}: initiating kitsu compare...`);

  const allShows = await horriblesubs.allShows();
  // every enrichment call goes through the shared limiter
  const enrichJobs = allShows
      .slice(0, 1)
      .map((show) => limiter.schedule(() => enrichShow(show)));
  const shows = await Promise.all(enrichJobs);

  // a show is incorrect only when a mapping exists and its kitsu_id differs
  const incorrect = [];
  for (const show of shows) {
    const mapping = showMappings[show.title];
    if (mapping && mapping.kitsu_id !== show.kitsu_id) {
      incorrect.push(show);
    }
  }

  console.log(incorrect);
  console.log(`Ratio: ${incorrect.length / shows.length}`);
}
async function initMapping() { async function initMapping() {
console.log(`${NAME}: initiating kitsu mapping...`); console.log(`${NAME}: initiating kitsu mapping...`);
const shows = await horriblesubs.allShows() const shows = await horriblesubs.allShows()
@@ -81,6 +93,28 @@ async function _parseShowData(showData) {
throw new Error(`No kitsuId found for ${showData.title}`); throw new Error(`No kitsuId found for ${showData.title}`);
} }
// sometimes a horriblesubs entry contains multiple seasons, so it needs to be split per kitsu season entry
const kitsuIdsMapping = kitsuId.length && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
.then((metas) => metas.reduce((map, meta) => {
const epOffset = Object.keys(map).length;
[...Array(meta.totalCount).keys()]
.map(ep => ep + 1)
.forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title });
return map;
}, {})) || {};
const formatTitle = (episodeInfo, mirror) => {
const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
if (mapping) {
return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
}
return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
};
const getKitsuId = inputEpisode => {
const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode;
const episode = parseInt(episodeString, 10);
return kitsuIdsMapping[episode] && kitsuIdsMapping[episode].kitsuId || kitsuId;
};
return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes) return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
.map((episodeInfo) => episodeInfo.mirrors .map((episodeInfo) => episodeInfo.mirrors
.map((mirror) => ({ .map((mirror) => ({
@@ -88,10 +122,10 @@ async function _parseShowData(showData) {
...mirror, ...mirror,
infoHash: decode(mirror.magnetLink).infoHash, infoHash: decode(mirror.magnetLink).infoHash,
trackers: decode(mirror.magnetLink).tr.join(','), trackers: decode(mirror.magnetLink).tr.join(','),
title: `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`, title: formatTitle(episodeInfo, mirror),
size: 300000000, size: 300000000,
type: Type.ANIME, type: Type.ANIME,
kitsuId: kitsuId, kitsuId: getKitsuId(episodeInfo.episode),
uploadDate: episodeInfo.uploadDate, uploadDate: episodeInfo.uploadDate,
}))) })))
.reduce((a, b) => a.concat(b), []) .reduce((a, b) => a.concat(b), [])
@@ -120,7 +154,7 @@ async function verifyFiles(torrent, files) {
} }
return files; return files;
} }
throw new Error(`No video files found for: ${torrent.title}`); return Promise.reject(`No video files found for: ${torrent.title}`);
} }
async function checkIfExists(torrent) { async function checkIfExists(torrent) {