diff --git a/index.js b/index.js index 24c5acb..091bf8f 100644 --- a/index.js +++ b/index.js @@ -6,7 +6,7 @@ const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_dump_s const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper'); const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper'); -const providers = [thepiratebayDumpScraper]; +const providers = [thepiratebayScraper]; async function scrape() { providers.forEach((provider) => provider.scrape()); diff --git a/lib/metadata.js b/lib/metadata.js index 921e119..0531a38 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -77,19 +77,21 @@ function escapeTitle(title, hyphenEscape = true) { .trim(); } -async function getImdbId(info) { - const key = `${info.name}_${info.year}_${info.type}`; +async function getImdbId(info, type) { + const name = escapeTitle(info.title).toLowerCase(); + const year = info.year || info.date && info.date.slice(0, 4); + const key = `${name}_${year}_${type}`; return cacheWrapImdbId(key, () => new Promise((resolve, reject) => { - nameToImdb(info, function (err, res) { + nameToImdb({ name, year, type }, function (err, res) { if (res) { resolve(res); } else { reject(err || new Error('failed imdbId search')); } }); - }).catch(() => bing.web(`${info.name} ${info.year || ''} ${info.type} imdb`) + }).catch(() => bing.web(`${name} ${year || ''} ${type} imdb`) .then(results => results .map((result) => result.link) .find(result => result.includes('imdb.com/title/'))) @@ -98,8 +100,10 @@ async function getImdbId(info) { } async function getKitsuId(info) { - const title = info.season > 1 ? `${info.name} S${info.season}` : info.name; - const query = title.replace(/[;]+/g, ' ').replace(/[,%']+/g, ''); + const title = escapeTitle(info.title).toLowerCase().replace(/[;]+/g, ' ').replace(/[,%']+/g, ''); + const season = info.season > 1 ? ` S${info.season}` : ''; + const query = `${title}${season}`; + return cacheWrapImdbId(query, () => needle('get', `${KITSU_URL}/catalog/series/kitsu-anime-list/search=${query}.json`, { open_timeout: 60000 }) .then((response) => { @@ -112,4 +116,4 @@ async function getKitsuId(info) { })); } -module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuId }; +module.exports = { getMetadata, getImdbId, getKitsuId }; diff --git a/lib/torrentEntries.js b/lib/torrentEntries.js index 40cd0a2..f31cd5d 100644 --- a/lib/torrentEntries.js +++ b/lib/torrentEntries.js @@ -1,23 +1,22 @@ const { parse } = require('parse-torrent-title'); const { Type } = require('./types'); const repository = require('./repository'); -const { getImdbId, getKitsuId, escapeTitle } = require('./metadata'); +const { getImdbId, getKitsuId } = require('./metadata'); const { parseTorrentFiles } = require('./torrentFiles'); async function createTorrentEntry(torrent) { const titleInfo = parse(torrent.title); - const searchTitle = escapeTitle(titleInfo.title).toLowerCase(); if (titleInfo.seasons && torrent.type === Type.MOVIE) { // sometimes series torrent might be put into movies category torrent.type = Type.SERIES; } if (!torrent.imdbId && torrent.type !== Type.ANIME) { - torrent.imdbId = await getImdbId({ name: searchTitle, year: titleInfo.year, type: torrent.type }) + torrent.imdbId = await getImdbId(titleInfo, torrent.type) .catch(() => undefined); } if (!torrent.kitsuId && torrent.type === Type.ANIME) { - torrent.kitsuId = await getKitsuId({ name: searchTitle, season: titleInfo.season }) + torrent.kitsuId = await getKitsuId(titleInfo) .catch(() => undefined); } diff --git a/lib/torrentFiles.js b/lib/torrentFiles.js index d15e875..cc04d09 100644 --- a/lib/torrentFiles.js +++ b/lib/torrentFiles.js @@ -283,12 +283,7 @@ function assignKitsuOrImdbEpisodes(files, metadata) { function findMovieImdbId(title) { const parsedTitle = typeof title === 'string' ? parse(title) : title; - const searchQuery = { - name: escapeTitle(parsedTitle.title).toLowerCase(), - year: parsedTitle.year, - type: Type.MOVIE - }; - return getImdbId(searchQuery).catch((error) => undefined); + return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined); } function div100(episode) { diff --git a/manual/manual.js b/manual/manual.js index d82b73e..9f93692 100644 --- a/manual/manual.js +++ b/manual/manual.js @@ -168,5 +168,5 @@ async function findAllFiles() { //addMissingEpisodes().then(() => console.log('Finished')); //findAllFiles().then(() => console.log('Finished')); //updateMovieCollections().then(() => console.log('Finished')); -reapplyEpisodeDecomposing('87e7354028f2aaab56dfd0dabbab679a1b54c3c0', false).then(() => console.log('Finished')); +reapplyEpisodeDecomposing('83b61caa4191469a9c15ee851aff828184f9a78d', false).then(() => console.log('Finished')); //reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished')); \ No newline at end of file diff --git a/scrapers/horriblesubs/horriblesubs_scraper.js b/scrapers/horriblesubs/horriblesubs_scraper.js index fd40dcb..d275389 100644 --- a/scrapers/horriblesubs/horriblesubs_scraper.js +++ b/scrapers/horriblesubs/horriblesubs_scraper.js @@ -1,6 +1,5 @@ const fs = require('fs'); const Bottleneck = require('bottleneck'); -const { parse } = require('parse-torrent-title'); const decode = require('magnet-uri'); const horriblesubs = require('./horriblesubs_api.js'); const repository = require('../../lib/repository'); @@ -54,8 +53,8 @@ async function initMapping() { async function enrichShow(show) { console.log(`${NAME}: getting show info for ${show.title}...`); const showId = await horriblesubs._getShowId(show.url) - .catch((error) => show.title); - const metadata = await getKitsuId({ name: show.title }) + .catch(() => show.title); + const metadata = await getKitsuId({ title: show.title }) .then((kitsuId) => getMetadata(kitsuId)) .catch((error) => { console.log(`Failed getting kitsu meta: ${error.message}`); diff --git a/scrapers/thepiratebay/thepiratebay_dump_scraper.js b/scrapers/thepiratebay/thepiratebay_dump_scraper.js index cb817d5..32d808f 100644 --- a/scrapers/thepiratebay/thepiratebay_dump_scraper.js +++ b/scrapers/thepiratebay/thepiratebay_dump_scraper.js @@ -18,12 +18,12 @@ const limiter = new Bottleneck({ maxConcurrent: 40 }); async function scrape() { const lastScraped = await repository.getProvider({ name: NAME }); const lastDump = { updatedAt: 2147000000 }; - const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); + //const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) { console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); - //await downloadDump(lastDump); + await downloadDump(lastDump); let entriesProcessed = 0; const lr = new LineByLineReader(CSV_FILE_PATH); @@ -51,10 +51,10 @@ async function scrape() { size: parseInt(row[3], 10) }; - if (torrent.uploadDate > checkPoint) { - entriesProcessed++; - return; - } + // if (torrent.uploadDate > checkPoint) { + // entriesProcessed++; + // return; + // } if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) { // torrent was already scraped previously, skipping @@ -75,8 +75,8 @@ async function scrape() { console.log(err); }); lr.on('end', () => { - fs.unlink(CSV_FILE_PATH); - repository.updateProvider({ name: NAME, lastScraped: lastDump.updatedAt }); + fs.unlink(CSV_FILE_PATH, (error) => console.warn(error)); + //repository.updateProvider({ name: NAME, lastScraped: lastDump.updatedAt }); console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`); }); } @@ -144,8 +144,8 @@ async function findTorrentInSource(record) { async function findTorrentViaBing(record) { return bing.web(`${record.infoHash}`) .then((results) => results - .find(result => result.description.includes('Direct download via magnet link') || result.description.includes( - 'Get this torrent'))) + .find(result => result.description.includes('Direct download via magnet link') || + result.description.includes('Get this torrent'))) .then((result) => { if (!result) { throw new Error(`Failed to find torrent ${record.title}`); diff --git a/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js b/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js index 3892a4c..99ec125 100644 --- a/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js +++ b/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js @@ -15,7 +15,7 @@ const limiter = new Bottleneck({ maxConcurrent: 40 }); async function scrape() { console.log(`starting to scrape tpb dump...`); //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); - const checkPoint = 951000; + const checkPoint = 4115000; let entriesProcessed = 0; const lr = new LineByLineReader(CSV_FILE_PATH);