From 9ea3932af134946ee034a3316ab751a44f771667 Mon Sep 17 00:00:00 2001
From: TheBeastLT
Date: Thu, 23 Apr 2020 16:33:08 +0200
Subject: [PATCH] [scraper] rework scraper scheduling and added seeders updating

---
 package-lock.json                             |  4 +-
 package.json                                  |  3 +-
 scraper/README.md                             |  2 +-
 scraper/index.js                              | 64 +------------------
 scraper/lib/torrent.js                        |  5 +-
 scraper/lib/torrentEntries.js                 |  6 +-
 scraper/manual/manual.js                      |  1 -
 scraper/scheduler/scheduler.js                | 13 ++++
 scraper/scheduler/scraper.js                  | 25 ++++++++
 scraper/scheduler/scrapers.js                 | 22 +++++++
 scraper/scheduler/seeders.js                  | 41 ++++++++++++
 scraper/scrapers/1337x/1337x_scraper.js       |  5 +-
 scraper/scrapers/eztv/eztv_api.js             |  2 +-
 scraper/scrapers/eztv/eztv_scraper.js         | 14 ++--
 .../horriblesubs/horriblesubs_scraper.js      |  3 +-
 scraper/scrapers/kickass/kickass_scraper.js   |  5 +-
 scraper/scrapers/rarbg/rarbg_scraper.js       | 12 +---
 .../thepiratebay/thepiratebay_scraper.js      |  5 +-
 scraper/scrapers/yts/yts_scraper.js           |  4 +-
 19 files changed, 128 insertions(+), 108 deletions(-)
 create mode 100644 scraper/scheduler/scheduler.js
 create mode 100644 scraper/scheduler/scraper.js
 create mode 100644 scraper/scheduler/scrapers.js
 create mode 100644 scraper/scheduler/seeders.js

diff --git a/package-lock.json b/package-lock.json
index b66438e..7269ab9 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1676,8 +1676,8 @@ }
     },
     "parse-torrent-title": {
-      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
-      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
+      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
+      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
       "requires": {
         "moment": "^2.24.0"
       }
     }
diff --git a/package.json b/package.json
index 8aacbc2..9d14d0e 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,6 @@
     "cache-manager": "^2.9.0",
     "cache-manager-mongodb": "^0.2.1",
     "cheerio": "^0.22.0",
-    "dotenv": "^8.2.0",
     "express": "^4.16.4",
     "google-it": "^1.5.0",
     "he": "^1.2.0",
@@ -31,7 +30,7 @@
     "node-schedule": "^1.3.2",
     "nodejs-bing": "^0.1.0",
     "parse-torrent": "^6.1.2",
-    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
+    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
     "peer-search": "^0.6.x",
     "pg": "^7.8.2",
     "pg-hstore": "^2.3.2",
diff --git a/scraper/README.md b/scraper/README.md
index 2850312..d49c360 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -20,7 +20,7 @@ Scrape movie and tv catalog using [www.webscraper.io](https://www.webscraper.io/)
 
 Movies sitemap
 ```json
-{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4110]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width]","multiple":true,"regex":"tt[0-9]+","delay":0}]}
+{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
 ```
 
 TV sitemap
diff --git a/scraper/index.js b/scraper/index.js
index d75eb1d..3ad1aa1 100644
--- a/scraper/index.js
+++ b/scraper/index.js
@@ -1,65 +1,7 @@
-require('dotenv').config();
 const express = require("express");
 const server = express();
-const schedule = require('node-schedule');
-const { connect, getUpdateSeedersTorrents } = require('./lib/repository');
-const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
-const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
-const ytsScraper = require('./scrapers/yts/yts_scraper');
-const eztvScraper = require('./scrapers/eztv/eztv_scraper');
-const leetxScraper = require('./scrapers/1337x/1337x_scraper');
-const kickassScraper = require('./scrapers/kickass/kickass_scraper');
-const rarbgScraper = require('./scrapers/rarbg/rarbg_scraper');
-const rarbgDumpScraper = require('./scrapers/rarbg/rarbg_dump_scraper');
-const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper');
-const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
-
-const PROVIDERS = [
-  // require('./scrapers/thepiratebay/thepiratebay_update_size_scraper')
-  // require('./scrapers/1337x/1337x_dump_scraper')
-  ytsScraper,
-  eztvScraper,
-  horribleSubsScraper,
-  rarbgScraper,
-  thepiratebayScraper,
-  kickassScraper,
-  leetxScraper
-  // rarbgDumpScraper
-  // thepiratebayDumpScraper
-  // thepiratebayUnofficialDumpScraper
-];
-const SCRAPE_CRON = process.env.SCRAPE_CRON || '0 0 */4 ? * *'; // every 4 hours
-const SEEDERS_CRON = '0 */2 * ? * *'; // every 2 minutes
-
-async function scrape() {
-  return PROVIDERS
-      .reduce(async (previousPromise, nextProvider) => {
-        await previousPromise;
-        return nextProvider.scrape().catch(error => {
-          console.warn(`Failed ${nextProvider.NAME} scraping due: `, error);
-          return Promise.resolve()
-        });
-      }, Promise.resolve());
-}
-
-async function updateSeeders() {
-  return getUpdateSeedersTorrents()
-      .then(torrents => Promise.all(torrents
-          .map(torrent => PROVIDERS.find(provider => provider.NAME === torrent.provider)
-              .updateSeeders(torrent))))
-      .then(() => console.log('Finished updating seeders'));
-}
-
-function enableScheduling() {
-  if (process.env.ENABLE_SCHEDULING) {
-    schedule.scheduleJob(SCRAPE_CRON,
-        () => scrape().catch(error => console.error('Failed scraping: ', error)));
-    // schedule.scheduleJob(SEEDERS_CRON,
-    //     () => updateSeeders().catch(error => console.error('Failed update seeders: ', error)));
-  } else {
-    scrape().catch(error => console.error('Failed scraping: ', error));
-  }
-}
+const { connect } = require('./lib/repository');
+const { startScraper } = require('./scheduler/scheduler')
 
 server.get('/', function (req, res) {
   res.sendStatus(200);
@@ -68,5 +10,5 @@ server.get('/', function (req, res) {
 server.listen(process.env.PORT || 7000, async () => {
   await connect();
   console.log('Scraper started');
-  enableScheduling();
+  startScraper();
 });
\ No newline at end of file
diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js
index 79014df..801564a 100644
--- a/scraper/lib/torrent.js
+++ b/scraper/lib/torrent.js
@@ -18,8 +18,9 @@ module.exports.updateCurrentSeeders = function (torrent) {
     const Tracker = require("peer-search/tracker");
     const seeders = {};
 
-    const decodedMagnetLink = torrent.magnetLink && decode(torrent.magnetLink);
-    const trackers = decodedMagnetLink && decodedMagnetLink.tr || torrent.trackers || await getDefaultTrackers();
+    const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr;
+    const torrentTrackers = torrent.trackers && torrent.trackers.split(',');
+    const trackers = magnetTrackers || torrentTrackers || await getDefaultTrackers();
 
     const callback = () => resolve(Math.max(...Object.values(seeders).map(values => values[0]).concat(0)));
     setTimeout(callback, SEEDS_CHECK_TIMEOUT);
diff --git a/scraper/lib/torrentEntries.js b/scraper/lib/torrentEntries.js
index ab8bfb8..bdd31a7 100644
--- a/scraper/lib/torrentEntries.js
+++ b/scraper/lib/torrentEntries.js
@@ -29,20 +29,20 @@ async function createTorrentEntry(torrent, overwrite = false) {
   }
 
   if (!torrent.imdbId && !torrent.kitsuId && !titleInfo.complete && typeof titleInfo.year !== 'string') {
-    console.log(`imdbId or kitsuId not found: ${torrent.title}`);
+    console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
     return;
   }
 
   const files = await parseTorrentFiles(torrent)
      .then(files => overwrite ? overwriteExistingFiles(torrent, files) : files);
   if (!files || !files.length) {
-    console.log(`no video files found for [${torrent.infoHash}] ${torrent.title}`);
+    console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
     return;
   }
 
   return repository.createTorrent(torrent)
      .then(() => Promise.all(files.map(file => repository.createFile(file))))
-      .then(() => console.log(`Created entry for [${torrent.infoHash}] ${torrent.title}`));
+      .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
 }
 
 async function overwriteExistingFiles(torrent, files) {
diff --git a/scraper/manual/manual.js b/scraper/manual/manual.js
index 5dc8716..18e0936 100644
--- a/scraper/manual/manual.js
+++ b/scraper/manual/manual.js
@@ -1,4 +1,3 @@
-require('dotenv').config();
 const Bottleneck = require('bottleneck');
 const { parse } = require('parse-torrent-title');
 const repository = require('../lib/repository');
diff --git a/scraper/scheduler/scheduler.js b/scraper/scheduler/scheduler.js
new file mode 100644
index 0000000..fd4563d
--- /dev/null
+++ b/scraper/scheduler/scheduler.js
@@ -0,0 +1,13 @@
+const { scheduleScraping, scrapeAll } = require('./scraper')
+const { scheduleUpdateSeeders } = require('./seeders')
+
+function startScraper() {
+  if (process.env.ENABLE_SCHEDULING) {
+    scheduleScraping();
+    scheduleUpdateSeeders();
+  } else {
+    scrapeAll()
+  }
+}
+
+module.exports = { startScraper }
\ No newline at end of file
diff --git a/scraper/scheduler/scraper.js b/scraper/scheduler/scraper.js
new file mode 100644
index 0000000..c6e836c
--- /dev/null
+++ b/scraper/scheduler/scraper.js
@@ -0,0 +1,25 @@
+const scrapers = require('./scrapers');
+const { delay, sequence } = require('../lib/promises')
+
+function scheduleScraping() {
+  return scrapers.forEach(provider => _continuousScrape(provider))
+}
+
+function scrapeAll() {
+  return sequence(scrapers.map(provider => () => _singleScrape(provider)))
+}
+
+async function _continuousScrape(provider) {
+  return _singleScrape(provider)
+      .then(() => delay(provider.scrapeInterval))
+      .then(() => _continuousScrape(provider))
+}
+
+async function _singleScrape(provider) {
+  return provider.scraper.scrape().catch(error => {
+    console.warn(`Failed ${provider.name} scraping due: `, error);
+    return Promise.resolve()
+  })
+}
+
+module.exports = { scheduleScraping, scrapeAll }
\ No newline at end of file
diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js
new file mode 100644
index 0000000..175ab91
--- /dev/null
+++ b/scraper/scheduler/scrapers.js
@@ -0,0 +1,22 @@
+const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
+const horribleSubsScraper = require('../scrapers/horriblesubs/horriblesubs_scraper');
+const ytsScraper = require('../scrapers/yts/yts_scraper');
+const eztvScraper = require('../scrapers/eztv/eztv_scraper');
+const leetxScraper = require('../scrapers/1337x/1337x_scraper');
+const kickassScraper = require('../scrapers/kickass/kickass_scraper');
+const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
+
+module.exports = [
+  { scraper: ytsScraper, name: ytsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  { scraper: eztvScraper, name: eztvScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  { scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  { scraper: rarbgScraper, name: rarbgScraper.NAME, scrapeInterval: 2 * 60 * 60 * 1000 },
+  { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  { scraper: kickassScraper, name: kickassScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  { scraper: leetxScraper, name: leetxScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
+  // { scraper: require('../scrapers/1337x/1337x_dump_scraper') }
+  // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
+  // { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
+  // { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
+  // { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
+];
\ No newline at end of file
diff --git a/scraper/scheduler/seeders.js b/scraper/scheduler/seeders.js
new file mode 100644
index 0000000..816b0ee
--- /dev/null
+++ b/scraper/scheduler/seeders.js
@@ -0,0 +1,41 @@
+const Bottleneck = require('bottleneck');
+const scrapers = require('./scrapers');
+const repository = require('../lib/repository')
+const { delay } = require('../lib/promises')
+const { updateCurrentSeeders } = require('../lib/torrent')
+const { updateTorrentSeeders } = require('../lib/torrentEntries')
+
+const DELAY = 15 * 1000; // 15 seconds
+const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });
+const forceSeedersLimiter = new Bottleneck({ maxConcurrent: 5 });
+
+function scheduleUpdateSeeders() {
+  console.log('Starting seeders update...')
+  return repository.getUpdateSeedersTorrents()
+      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => _updateSeeders(torrent)))))
+      .then(() => console.log('Finished seeders update'))
+      .then(() => delay(DELAY))
+      .then(() => scheduleUpdateSeeders());
+}
+
+async function _updateSeeders(torrent) {
+  const provider = await scrapers.find(provider => provider.name === torrent.provider);
+  const updatedTorrents = await provider.scraper.updateSeeders(torrent, getImdbIdsMethod(torrent))
+      .then(updated => Array.isArray(updated) ? updated : [updated])
+      .catch(() => []);
+
+  if (!updatedTorrents.find(updated => updated.infoHash === torrent.infoHash)) {
+    await forceSeedersLimiter.schedule(() => updateCurrentSeeders(torrent))
+        .then(updated => updatedTorrents.push(updated));
+  }
+
+  return Promise.all(updatedTorrents.map(updated => updateTorrentSeeders(updated)))
+}
+
+async function getImdbIdsMethod(torrent) {
+  return () => repository.getFiles(torrent)
+      .then(files => files.map(file => file.imdbId).filter(id => id))
+      .then(ids => Array.from(new Set(ids)));
+}
+
+module.exports = { scheduleUpdateSeeders }
\ No newline at end of file
diff --git a/scraper/scrapers/1337x/1337x_scraper.js b/scraper/scrapers/1337x/1337x_scraper.js
index c688a9c..16d01c5 100644
--- a/scraper/scrapers/1337x/1337x_scraper.js
+++ b/scraper/scrapers/1337x/1337x_scraper.js
@@ -27,10 +27,7 @@ async function scrape() {
 }
 
 async function updateSeeders(torrent) {
-  return limiter.schedule(() => leetx.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => leetx.torrent(torrent.torrentId));
 }
 
 async function scrapeLatestTorrents() {
diff --git a/scraper/scrapers/eztv/eztv_api.js b/scraper/scrapers/eztv/eztv_api.js
index e561770..26e1595 100644
--- a/scraper/scrapers/eztv/eztv_api.js
+++ b/scraper/scrapers/eztv/eztv_api.js
@@ -33,7 +33,7 @@ function search(imdbId, config = {}, retries = 2) {
   return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config)))
      .then(results => parseResults(results))
-      .then(torrents => torrents.length === limit && page < maxPage
+      .then(torrents => torrents.length === limit && page < maxPage && !torrents.find(t => t.imdbId === imdbId)
          ? search(imdbId, { ...config, page: page + 1 })
              .catch(() => [])
              .then(nextTorrents => torrents.concat(nextTorrents))
diff --git a/scraper/scrapers/eztv/eztv_scraper.js b/scraper/scrapers/eztv/eztv_scraper.js
index bc636f5..1c182f2 100644
--- a/scraper/scrapers/eztv/eztv_scraper.js
+++ b/scraper/scrapers/eztv/eztv_scraper.js
@@ -4,14 +4,12 @@ const eztv = require('./eztv_api');
 const { Type } = require('../../lib/types');
 const Promises = require('../../lib/promises');
 const repository = require('../../lib/repository');
-
-const { updateCurrentSeeders } = require('../../lib/torrent');
 const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries');
 
 const NAME = 'EZTV';
 const UNTIL_PAGE = 10;
 
-const limiter = new Bottleneck({ maxConcurrent: 20 });
+const limiter = new Bottleneck({ maxConcurrent: 1 });
 
 async function scrape() {
   const scrapeStart = moment();
@@ -26,11 +24,9 @@ async function scrape() {
     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
 
-async function updateSeeders(torrent) {
-  return limiter.schedule(() => eztv.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+async function updateSeeders(torrent, getImdbIdsMethod) {
+  return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
+      .map(imdbId => limiter.schedule(() => eztv.search(imdbId)))));
 }
 
 async function scrapeLatestTorrents() {
@@ -45,7 +41,7 @@ async function scrapeLatestTorrentsForCategory(page = 1) {
        // return Promises.delay(30000).then(() => scrapeLatestTorrentsForCategory(page))
        return Promise.resolve([]);
      })
-      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
+      .then(torrents => Promise.all(torrents.map(torrent => processTorrentRecord(torrent))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(page + 1)
          : Promise.resolve());
diff --git a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
index f9ff9a7..ee05485 100644
--- a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
+++ b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -37,8 +37,7 @@ async function scrape() {
 }
 
 async function updateSeeders(torrent) {
-  return entryLimiter.schedule(() => updateCurrentSeeders(torrent)
-      .then(updated => updateTorrentSeeders(updated)));
+  return Promise.resolve([]);
 }
 
 async function _scrapeLatestEntries() {
diff --git a/scraper/scrapers/kickass/kickass_scraper.js b/scraper/scrapers/kickass/kickass_scraper.js
index e9c5311..d92bb42 100644
--- a/scraper/scrapers/kickass/kickass_scraper.js
+++ b/scraper/scrapers/kickass/kickass_scraper.js
@@ -27,10 +27,7 @@ async function scrape() {
 }
 
 async function updateSeeders(torrent) {
-  return limiter.schedule(() => kickass.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => kickass.torrent(torrent.torrentId));
 }
 
 async function scrapeLatestTorrents() {
diff --git a/scraper/scrapers/rarbg/rarbg_scraper.js b/scraper/scrapers/rarbg/rarbg_scraper.js
index 5efcd66..d4d6a23 100644
--- a/scraper/scrapers/rarbg/rarbg_scraper.js
+++ b/scraper/scrapers/rarbg/rarbg_scraper.js
@@ -26,15 +26,9 @@ async function scrape() {
     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
 
-async function updateSeeders(torrent) {
-  const imdbIds = await repository.getFiles(torrent)
-      .then(files => files.map(file => file.imdbId))
-      .then(ids => Array.from(new Set(ids)));
-
-  return Promise.all(imdbIds.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb'))))
-      .then(results => results.reduce((a, b) => a.concat(b), []))
-      .then(results => results.map(result => toTorrent(result)))
-      .then(torrents => Promise.all(torrents.map(updated => updateTorrentSeeders(updated))));
+async function updateSeeders(torrent, getImdbIdsMethod) {
+  return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
+      .map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb')))));
 }
 
 async function scrapeLatestTorrents() {
diff --git a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
index 7cb4260..4844115 100644
--- a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
+++ b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -39,10 +39,7 @@ async function scrape() {
 }
 
 async function updateSeeders(torrent) {
-  return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId));
 }
 
 async function scrapeLatestTorrents() {
diff --git a/scraper/scrapers/yts/yts_scraper.js b/scraper/scrapers/yts/yts_scraper.js
index 25d9784..a169ae3 100644
--- a/scraper/scrapers/yts/yts_scraper.js
+++ b/scraper/scrapers/yts/yts_scraper.js
@@ -24,9 +24,7 @@ async function scrape() {
 }
 
 async function updateSeeders(torrent) {
-  return limiter.schedule(() => yts.torrent(torrent.torrentId)
-      .then(records => records.map(record => ({ ...record, provider: NAME })))
-      .then(records => Promise.all(records.map(record => updateTorrentSeeders(record)))));
+  return limiter.schedule(() => yts.torrent(torrent.torrentId));
 }
 
 async function scrapeLatestTorrents() {
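
Note on the new scheduling model: the single `SCRAPE_CRON` job is replaced by one independent loop per provider, driven by the `scrapeInterval` values in `scraper/scheduler/scrapers.js`, plus a separate continuous loop for seeders updates. The `delay` and `sequence` helpers come from `../lib/promises`, which this patch does not show, so the snippet below is only a minimal sketch under the assumption that they behave as shown; the stub provider is hypothetical and exists only to make the example runnable.

```js
// Assumed shape of the helpers from ../lib/promises (not included in this patch).
function delay(ms) {
  // Resolve after the given number of milliseconds.
  return new Promise(resolve => setTimeout(resolve, ms));
}

function sequence(promiseFunctions) {
  // Run promise-returning functions one after another, collecting their results.
  return promiseFunctions.reduce(
      (chain, next) => chain.then(results => next().then(result => results.concat([result]))),
      Promise.resolve([]));
}

// Loop equivalent to _continuousScrape in scraper/scheduler/scraper.js:
// scrape, wait the provider's own interval, repeat. A failed run is logged
// and swallowed, so it only delays that provider's next iteration.
async function continuousScrape(provider) {
  for (;;) {
    await provider.scraper.scrape()
        .catch(error => console.warn(`Failed ${provider.name} scraping: `, error));
    await delay(provider.scrapeInterval);
  }
}

// Hypothetical wiring example (the real provider list lives in scheduler/scrapers.js).
const exampleProvider = {
  name: 'EXAMPLE',
  scrapeInterval: 4 * 60 * 60 * 1000, // same 4h interval most providers use
  scraper: { scrape: () => Promise.resolve(console.log('scraping...')) }
};
continuousScrape(exampleProvider); // runs until the process is stopped
```

Compared to the removed `node-schedule` cron, this keeps providers independent: a slow or failing provider postpones only its own next run instead of the whole shared sequence, while `scrapeAll()` still runs the providers one by one via `sequence` when `ENABLE_SCHEDULING` is not set.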