diff --git a/scraper/index.js b/scraper/index.js
index 75091dc..cc00b10 100644
--- a/scraper/index.js
+++ b/scraper/index.js
@@ -14,7 +14,7 @@ const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepi
 
 const PROVIDERS = [
   // horribleSubsScraper,
-  rarbgScraper,
+  // rarbgScraper,
   thepiratebayScraper,
   kickassScraper,
   leetxScraper
@@ -42,7 +42,7 @@ function enableScheduling() {
 }
 
 server.get('/', function (req, res) {
-  res.send(200);
+  res.sendStatus(200);
 });
 
 server.listen(process.env.PORT || 7000, async () => {
diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js
index a0fd922..43042d6 100644
--- a/scraper/lib/metadata.js
+++ b/scraper/lib/metadata.js
@@ -96,10 +96,11 @@ async function getImdbId(info, type) {
         });
       }).catch(() => bing.web(`${name} ${year || ''} ${type} imdb`)
           .then(results => results
-              .map((result) => result.link)
+              .map(result => result.link)
               .find(result => result.includes('imdb.com/title/')))
           .then(result => result && result.match(/imdb\.com\/title\/(tt\d+)/))
-          .then(match => match && match[1])));
+          .then(match => match && match[1])))
+      .then(imdbId => 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0'));
 }
 
 async function getKitsuId(info) {
diff --git a/scraper/lib/repository.js b/scraper/lib/repository.js
index c6e748f..e498f63 100644
--- a/scraper/lib/repository.js
+++ b/scraper/lib/repository.js
@@ -1,9 +1,14 @@
-const { Sequelize } = require('sequelize');
+const { Sequelize, fn, col } = require('sequelize');
 const Op = Sequelize.Op;
 
 const DATABASE_URI = process.env.DATABASE_URI;
 
-const database = new Sequelize(DATABASE_URI, { logging: false });
+const database = new Sequelize(
+    DATABASE_URI,
+    {
+      logging: false
+    }
+);
 
 const Provider = database.define('provider', {
   name: { type: Sequelize.STRING(32), primaryKey: true },
@@ -47,6 +52,19 @@ const File = database.define('file',
     },
     {
       indexes: [
+        {
+          unique: true,
+          name: 'files_unique_file_constraint',
+          fields: [
+            col('infoHash'),
+            fn('COALESCE', (col('fileIndex')), -1),
+            fn('COALESCE', (col('imdbId')), 'null'),
+            fn('COALESCE', (col('imdbSeason')), -1),
+            fn('COALESCE', (col('imdbEpisode')), -1),
+            fn('COALESCE', (col('kitsuId')), -1),
+            fn('COALESCE', (col('kitsuEpisode')), -1)
+          ]
+        },
         { unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] },
         { unique: false, fields: ['kitsuId', 'kitsuEpisode'] }
       ]
@@ -63,7 +81,11 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', {
 });
 
 function connect() {
-  return database.sync({ alter: true });
+  return database.sync({ alter: true })
+      .catch(error => {
+        console.error('Failed syncing database: ', error);
+        throw error;
+      });
 }
 
 function getProvider(provider) {
diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js
index 780e836..e0aaaf8 100644
--- a/scraper/lib/torrent.js
+++ b/scraper/lib/torrent.js
@@ -111,7 +111,7 @@ async function filesFromTorrentStream(torrent) {
 function filterVideos(files) {
   return files.filter((file) => {
     const match = file.path.match(/\.(\w{2,4})$/);
-    return match && EXTENSIONS.includes(match[1]);
+    return match && EXTENSIONS.includes(match[1].toLowerCase());
   });
 }
 
diff --git a/scraper/lib/torrentFiles.js b/scraper/lib/torrentFiles.js
index 3c28338..e8cf43e 100644
--- a/scraper/lib/torrentFiles.js
+++ b/scraper/lib/torrentFiles.js
@@ -62,8 +62,9 @@ async function parseTorrentFiles(torrent) {
 }
 
 async function getSeriesFiles(torrent, parsedTorrentName) {
-  if ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
-      (!parsedTorrentName.episodes && parsedTorrentName.date)) {
+  if (!parsedTorrentName.complete && !parsedTorrentName.hasMovies &&
+      ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
+      (!parsedTorrentName.episodes && parsedTorrentName.date))) {
     return [{
       name: torrent.title,
       path: torrent.title,
diff --git a/scraper/manual/manual.js b/scraper/manual/manual.js
index 0b14219..fa7fa5c 100644
--- a/scraper/manual/manual.js
+++ b/scraper/manual/manual.js
@@ -2,6 +2,7 @@ require('dotenv').config();
 const Bottleneck = require('bottleneck');
 const { parse } = require('parse-torrent-title');
 const repository = require('../lib/repository');
+const { getImdbId } = require('../lib/metadata');
 const { parseTorrentFiles } = require('../lib/torrentFiles');
 const { Type } = require('../lib/types');
 
@@ -15,7 +16,7 @@ async function addMissingEpisodes() {
       const imdbId = Object.values(storedFiles)[0].imdbId;
 
       torrentFiles
-          .filter((file) => !storedFiles[file.fileIndex])
+          .filter((file) => !storedFiles[file.fileIndex !== undefined ? file.fileIndex : null])
           .map((file) => ({
             infoHash: torrent.infoHash,
             fileIndex: file.fileIndex,
@@ -63,11 +64,22 @@ async function reapplySeriesSeasonsSavedAsMovies() {
       .then(() => console.log('Finished updating multiple torrents'));
 }
 
+async function reapplyDecomposingToTorrentsOnRegex(regex) {
+  return repository.getTorrentsBasedOnTitle(regex, Type.ANIME)
+      .then(torrents => Promise.all(torrents
+          .map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, true)))))
+      .then(() => console.log('Finished updating multiple torrents'));
+}
+
 async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
   const torrent = await repository.getTorrent({ infoHash });
   const storedFiles = await repository.getFiles({ infoHash });
   const fileIndexMap = storedFiles
-      .reduce((map, next) => (map[next.fileIndex] = (map[next.fileIndex] || []).concat(next), map), {});
+      .reduce((map, next) => {
+        const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+        map[fileIndex] = (map[fileIndex] || []).concat(next);
+        return map;
+      }, {});
   const files = includeSourceFiles && Object.values(fileIndexMap)
       .map(sameIndexFiles => sameIndexFiles[0])
       .map(file => ({
@@ -76,12 +88,14 @@ async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
         path: file.title,
         size: file.size
       }));
-  const imdbId = storedFiles[0].imdbId;
+  const imdbId = storedFiles.length && storedFiles[0].imdbId || await getImdbId(parse(torrent.title));
 
   return parseTorrentFiles({ ...torrent, imdbId, files })
       .then(newFiles => newFiles.map(file => {
-        if (fileIndexMap[file.fileIndex]) {
-          const originalFile = fileIndexMap[file.fileIndex].shift();
+        const fileIndex = file.fileIndex !== undefined ? file.fileIndex : null;
+        const mapping = fileIndexMap[fileIndex];
+        if (mapping) {
+          const originalFile = mapping.shift();
           if (originalFile) {
             if (!originalFile.imdbId) {
               originalFile.imdbId = file.imdbId
@@ -176,5 +190,7 @@ async function findAllFiles() {
 //addMissingEpisodes().then(() => console.log('Finished'));
 //findAllFiles().then(() => console.log('Finished'));
 //updateMovieCollections().then(() => console.log('Finished'));
-reapplyEpisodeDecomposing('d71c4fd1cb9bb9c5365a570b903a3a58774f61a5', true).then(() => console.log('Finished'));
-//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
\ No newline at end of file
+reapplyEpisodeDecomposing('aec7bcac457ad68924e7119f859cf6fa3878f9f5', false).then(() => console.log('Finished'));
+//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
+// reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished'));
+//reapplyManualHashes().then(() => console.log('Finished'));
\ No newline at end of file
diff --git a/scraper/scrapers/1337x/1337x_scraper.js b/scraper/scrapers/1337x/1337x_scraper.js
index 91f3036..a32b910 100644
--- a/scraper/scrapers/1337x/1337x_scraper.js
+++ b/scraper/scrapers/1337x/1337x_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -40,14 +40,15 @@
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return leetx.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
+
 }
 
 async function processTorrentRecord(record) {
diff --git a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
index 882b862..f2ccf7f 100644
--- a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
+++ b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -11,7 +11,7 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
 const showMappings = require('./horriblesubs_mapping.json');
 
 const NAME = 'HorribleSubs';
-const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
+const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;
 
 const limiter = new Bottleneck({ maxConcurrent: 5 });
 const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
@@ -26,7 +26,7 @@ async function scrape() {
     return _scrapeAllShows()
         .then(() => {
           lastScrape.lastScraped = scrapeStart;
-          return repository.updateProvider(lastScrape);
+          return lastScrape.save();
         })
         .then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
   } else {
@@ -140,7 +140,7 @@ async function _parseShowData(showData) {
     return kitsuId;
   };
 
-  return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
+  return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
       .map((episodeInfo) => episodeInfo.mirrors
           .filter((mirror) => mirror.magnetLink && mirror.magnetLink.length)
           .map((mirror) => ({
@@ -169,15 +169,23 @@
 async function verifyFiles(torrent, files) {
   if (files && files.length) {
     const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
-        .then((existing) => existing.reduce((map, file) => (map[file.fileIndex] = file, map), {}))
+        .then((existing) => existing
+            .reduce((map, next) => {
+              const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+              map[fileIndex] = (map[fileIndex] || []).concat(next);
+              return map;
+            }, {}))
         .catch(() => undefined);
     if (existingFiles && Object.keys(existingFiles).length) {
       return files
-          .map(file => ({
-            ...file,
-            id: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].id,
-            size: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].size || file.size
-          }))
+          .map(file => {
+            const mapping = existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
+            if (mapping) {
+              const originalFile = mapping.shift();
+              return { ...file, id: originalFile.id, size: originalFile.size || file.size };
+            }
+            return file;
+          })
     }
     return files;
   }
diff --git a/scraper/scrapers/kickass/kickass_scraper.js b/scraper/scrapers/kickass/kickass_scraper.js
index 8d76fe3..367b6b1 100644
--- a/scraper/scrapers/kickass/kickass_scraper.js
+++ b/scraper/scrapers/kickass/kickass_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -39,14 +39,14 @@
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return kickass.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }
 
 async function processTorrentRecord(record) {
diff --git a/scraper/scrapers/rarbg/rarbg_dump_scraper.js b/scraper/scrapers/rarbg/rarbg_dump_scraper.js
index 8428021..4534886 100644
--- a/scraper/scrapers/rarbg/rarbg_dump_scraper.js
+++ b/scraper/scrapers/rarbg/rarbg_dump_scraper.js
@@ -16,12 +16,13 @@ const entryLimiter = new Bottleneck({ maxConcurrent: 40 });
 
 async function scrape() {
   console.log(`[${moment()}] starting ${NAME} dump scrape...`);
-  const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
-  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json');
-  const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
+  //const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
+  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json').slice(800);
+  //const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
 
-  return Promise.all(allImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId)
-      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))))))))
+  return Promise.all(
+      seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
+          .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
       .then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
 }
 
diff --git a/scraper/scrapers/rarbg/rarbg_scraper.js b/scraper/scrapers/rarbg/rarbg_scraper.js
index f30a12a..eea108a 100644
--- a/scraper/scrapers/rarbg/rarbg_scraper.js
+++ b/scraper/scrapers/rarbg/rarbg_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -64,8 +64,8 @@ async function scrapeLatestTorrentsForCategory(category) {
       })))
      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
+        console.warn(`Failed ${NAME} scrapping for ${category} due: `, error);
+        return Promise.resolve([]);
       });
 }
 
diff --git a/scraper/scrapers/thepiratebay/thepiratebay_api.js b/scraper/scrapers/thepiratebay/thepiratebay_api.js
index 3754a91..6e76243 100644
--- a/scraper/scrapers/thepiratebay/thepiratebay_api.js
+++ b/scraper/scrapers/thepiratebay/thepiratebay_api.js
@@ -6,9 +6,9 @@ const Promises = require('../../lib/promises');
 
 const defaultProxies = [
   'https://thepiratebay.org',
-  'https://piratebays.icu',
-  'https://piratebays.cool',
-  'https://piratebays.life'];
+  'https://proxybay.pro',
+  'https://ukpiratebayproxy.com',
+  'https://thepiratebayproxy.info'];
 const dumpUrl = '/static/dump/csv/';
 const defaultTimeout = 10000;
 
@@ -169,15 +169,18 @@ function parseBody(body) {
     $('table[id=\'searchResult\'] tr').each(function () {
       const name = $(this).find('.detLink').text();
       const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
+      const magnetLink = $(this).find('a[title=\'Download this torrent using magnet\']').attr('href');
       if (!name || !sizeMatcher) {
         return;
       }
       torrents.push({
-        torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
         name: name,
+        magnetLink: magnetLink,
+        infoHash: decode(magnetLink).infoHash,
+        torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
         seeders: parseInt($(this).find('td[align=\'right\']').eq(0).text(), 10),
         leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10),
-        magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
+
         category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0],
             10),
         subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0],
diff --git a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
index d4b2854..4b5dc28 100644
--- a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
+++ b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -32,7 +32,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -45,14 +45,14 @@
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return thepiratebay.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }
 
 async function processTorrentRecord(record) {