updates horriblesubs scraper to split multiple season entries

This commit is contained in:
TheBeastLT
2020-03-05 14:57:57 +01:00
parent cbfcea65f8
commit 51eb21f112
6 changed files with 47 additions and 14 deletions

View File

@@ -6,7 +6,7 @@ const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_dump_s
const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
const providers = [thepiratebayScraper];
const providers = [horribleSubsScraper];
async function scrape() {
providers.forEach((provider) => provider.scrape());

View File

@@ -67,18 +67,18 @@ function _requestMetadata(url) {
});
}
// Normalizes a title into a lowercase, accent-free, space-separated string.
// Steps: lowercase, NFKD-normalize, drop combining marks (U+0300-U+036F), replace "&" with "and",
// collapse runs of separator characters into a single space, remove the remaining
// disallowed characters and trim surrounding whitespace.
// NOTE(review): this span appears to interleave the previous and the updated revision of the
// function (two headers, two pairs of replace calls) - confirm which lines are current.
function escapeTitle(title, hyphenEscape = true) {
function escapeTitle(title) {
return title.toLowerCase()
.normalize('NFKD') // normalize non-ASCII characters
.replace(/[\u0300-\u036F]/g, '')
.replace(/&/g, 'and')
.replace(hyphenEscape ? /[.,_+ -]+/g : /[.,_+ ]+/g, ' ') // replace dots, commas or underscores with spaces
.replace(/[^\w- ()]/gi, '') // remove all non-alphanumeric chars
.replace(/[;, ~\-]+/g, ' ') // replace runs of semicolons, commas, spaces, tildes or hyphens with a single space
// the next step keeps word characters, spaces, parentheses and the symbols + # @ ! plus the apostrophe
.replace(/[^\w ()+#@!']+/g, '') // remove all non-alphanumeric chars
.trim();
}
async function getImdbId(info, type) {
const name = escapeTitle(info.title).toLowerCase();
const name = escapeTitle(info.title);
const year = info.year || info.date && info.date.slice(0, 4);
const key = `${name}_${year}_${type}`;
@@ -100,7 +100,7 @@ async function getImdbId(info, type) {
}
async function getKitsuId(info) {
const title = escapeTitle(info.title).toLowerCase().replace(/[;]+/g, ' ').replace(/[,%']+/g, '');
const title = escapeTitle(info.title);
const season = info.season > 1 ? ` S${info.season}` : '';
const query = `${title}${season}`;

View File

@@ -4,7 +4,7 @@ const { torrentFiles } = require('../lib/torrent');
const { escapeTitle, getMetadata, getImdbId } = require('../lib/metadata');
const { Type } = require('./types');
const MIN_SIZE = 10 * 1024 * 1024; // 20 MB
const MIN_SIZE = 10 * 1024 * 1024; // 10 MB
async function parseTorrentFiles(torrent) {
const parsedTorrentName = parse(torrent.title);

2
package-lock.json generated
View File

@@ -1714,7 +1714,7 @@
}
},
"parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#6767960f4c40c7c97fd2f8e191cf099ab7dcd1dd",
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#ddd5037820289d35e600baec9d8e730935d261af",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#master"
},
"parseurl": {

View File

@@ -68,8 +68,7 @@
"kitsu_id": "6508"
},
"Anohana (Live Action)": {
"showId": "18",
"kitsu_id": "5981"
"showId": "18"
},
"Another": {
"showId": "19",
@@ -153,7 +152,7 @@
},
"Binan Koukou Chikyuu Bouei-bu Love!": {
"showId": "38",
"kitsu_id": "11329"
"kitsu_id": "9173"
},
"Black Bullet": {
"showId": "39",

View File

@@ -34,6 +34,18 @@ async function _scrapeAllShows() {
.catch((err) => console.log(err)))));
}
/**
 * Debug helper: enriches horriblesubs shows through the rate limiter and reports
 * the ones whose kitsu id differs from the id stored in the static show mappings.
 * Logs the mismatching shows and the mismatch ratio; returns nothing.
 */
async function compareSearchKitsuIds() {
  console.log(`${NAME}: initiating kitsu compare...`);

  const allShows = await horriblesubs.allShows();
  // only the first show is enriched and compared
  const sample = allShows.slice(0, 1);
  const shows = await Promise.all(sample.map((show) => limiter.schedule(() => enrichShow(show))));

  // shows without a stored mapping are ignored rather than counted as incorrect
  const incorrect = shows.filter((show) => {
    const mapping = showMappings[show.title];
    return mapping && mapping.kitsu_id !== show.kitsu_id;
  });
  const incorrectRatio = incorrect.length / shows.length;

  console.log(incorrect);
  console.log(`Ratio: ${incorrectRatio}`);
}
async function initMapping() {
console.log(`${NAME}: initiating kitsu mapping...`);
const shows = await horriblesubs.allShows()
@@ -81,6 +93,28 @@ async function _parseShowData(showData) {
throw new Error(`No kitsuId found for ${showData.title}`);
}
// Sometimes a horriblesubs entry contains multiple seasons, so it needs to be split per kitsu season entry.
// Builds a lookup from the continuous horriblesubs episode number to
// { kitsuId, episode, title } of the kitsu season that episode belongs to.
// Only an array of kitsu ids is expanded. A single id (string or number) yields an empty
// mapping, so the show-level id and title are used instead. Checking `.length` alone is not
// enough here: a string id has a truthy length but no `map` method.
const buildKitsuIdsMapping = async (ids) => {
  if (!Array.isArray(ids)) {
    return {};
  }
  const metas = await Promise.all(ids.map((id) => getMetadata(id)));
  const mapping = {};
  for (const meta of metas) {
    // episodes of each season continue the numbering after all previously mapped seasons
    const epOffset = Object.keys(mapping).length;
    // a meta without a numeric totalCount contributes no entries
    for (let ep = 1; ep <= meta.totalCount; ep++) {
      mapping[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title };
    }
  }
  return mapping;
};
const kitsuIdsMapping = await buildKitsuIdsMapping(kitsuId);
// Builds the torrent title "<title> - <episode> [<resolution>]".
// When the episode has an entry in kitsuIdsMapping, the per-season title and episode
// number from that entry are used; otherwise the values of episodeInfo are used.
const formatTitle = (episodeInfo, mirror) => {
  // mapping keys are plain numbers, so leading zeros are stripped before the lookup
  const lookupKey = episodeInfo.episode.replace(/^0+/, '');
  const seasonEntry = kitsuIdsMapping[lookupKey];
  const name = seasonEntry ? seasonEntry.title : episodeInfo.title;
  const episodeLabel = seasonEntry ? seasonEntry.episode : episodeInfo.episode;
  return `${name} - ${episodeLabel} [${mirror.resolution}]`;
};
// Resolves the kitsu id for an episode label such as "05" or a range such as "13-24".
// For a range the part before the first hyphen decides; when no per-episode mapping
// exists the show-level kitsuId is returned unchanged.
const getKitsuId = (inputEpisode) => {
  const [rangeStart] = inputEpisode.split('-');
  // an empty first part (label starting with a hyphen) falls back to the whole label
  const episodeNumber = parseInt(rangeStart || inputEpisode, 10);
  const seasonEntry = kitsuIdsMapping[episodeNumber];
  return (seasonEntry && seasonEntry.kitsuId) || kitsuId;
};
return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
.map((episodeInfo) => episodeInfo.mirrors
.map((mirror) => ({
@@ -88,10 +122,10 @@ async function _parseShowData(showData) {
...mirror,
infoHash: decode(mirror.magnetLink).infoHash,
trackers: decode(mirror.magnetLink).tr.join(','),
title: `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`,
title: formatTitle(episodeInfo, mirror),
size: 300000000,
type: Type.ANIME,
kitsuId: kitsuId,
kitsuId: getKitsuId(episodeInfo.episode),
uploadDate: episodeInfo.uploadDate,
})))
.reduce((a, b) => a.concat(b), [])
@@ -120,7 +154,7 @@ async function verifyFiles(torrent, files) {
}
return files;
}
throw new Error(`No video files found for: ${torrent.title}`);
return Promise.reject(`No video files found for: ${torrent.title}`);
}
async function checkIfExists(torrent) {