[scraper] rework scraper scheduling and add seeders updating

TheBeastLT
2020-04-23 16:33:08 +02:00
parent d01e2c9b35
commit 9ea3932af1
19 changed files with 128 additions and 108 deletions

package-lock.json generated
View File

@@ -1676,8 +1676,8 @@
}
},
"parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"requires": {
"moment": "^2.24.0"
}

View File

@@ -17,7 +17,6 @@
"cache-manager": "^2.9.0",
"cache-manager-mongodb": "^0.2.1",
"cheerio": "^0.22.0",
"dotenv": "^8.2.0",
"express": "^4.16.4",
"google-it": "^1.5.0",
"he": "^1.2.0",
@@ -31,7 +30,7 @@
"node-schedule": "^1.3.2",
"nodejs-bing": "^0.1.0",
"parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"peer-search": "^0.6.x",
"pg": "^7.8.2",
"pg-hstore": "^2.3.2",

View File

@@ -20,7 +20,7 @@ Scrape movie and tv catalog using [www.webscraper.io](https://www.webscraper.io/
Movies sitemap
```json
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4110]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width]","multiple":true,"regex":"tt[0-9]+","delay":0}]}
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```
TV sitemap

View File

@@ -1,65 +1,7 @@
require('dotenv').config();
const express = require("express");
const server = express();
const schedule = require('node-schedule');
const { connect, getUpdateSeedersTorrents } = require('./lib/repository');
const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
const ytsScraper = require('./scrapers/yts/yts_scraper');
const eztvScraper = require('./scrapers/eztv/eztv_scraper');
const leetxScraper = require('./scrapers/1337x/1337x_scraper');
const kickassScraper = require('./scrapers/kickass/kickass_scraper');
const rarbgScraper = require('./scrapers/rarbg/rarbg_scraper');
const rarbgDumpScraper = require('./scrapers/rarbg/rarbg_dump_scraper');
const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper');
const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
const PROVIDERS = [
// require('./scrapers/thepiratebay/thepiratebay_update_size_scraper')
// require('./scrapers/1337x/1337x_dump_scraper')
ytsScraper,
eztvScraper,
horribleSubsScraper,
rarbgScraper,
thepiratebayScraper,
kickassScraper,
leetxScraper
// rarbgDumpScraper
// thepiratebayDumpScraper
// thepiratebayUnofficialDumpScraper
];
const SCRAPE_CRON = process.env.SCRAPE_CRON || '0 0 */4 ? * *'; // every 4 hours
const SEEDERS_CRON = '0 */2 * ? * *'; // every 2 minutes
async function scrape() {
return PROVIDERS
.reduce(async (previousPromise, nextProvider) => {
await previousPromise;
return nextProvider.scrape().catch(error => {
console.warn(`Failed ${nextProvider.NAME} scraping due: `, error);
return Promise.resolve()
});
}, Promise.resolve());
}
async function updateSeeders() {
return getUpdateSeedersTorrents()
.then(torrents => Promise.all(torrents
.map(torrent => PROVIDERS.find(provider => provider.NAME === torrent.provider)
.updateSeeders(torrent))))
.then(() => console.log('Finished updating seeders'));
}
function enableScheduling() {
if (process.env.ENABLE_SCHEDULING) {
schedule.scheduleJob(SCRAPE_CRON,
() => scrape().catch(error => console.error('Failed scraping: ', error)));
// schedule.scheduleJob(SEEDERS_CRON,
// () => updateSeeders().catch(error => console.error('Failed update seeders: ', error)));
} else {
scrape().catch(error => console.error('Failed scraping: ', error));
}
}
const { connect } = require('./lib/repository');
const { startScraper } = require('./scheduler/scheduler')
server.get('/', function (req, res) {
res.sendStatus(200);
@@ -68,5 +10,5 @@ server.get('/', function (req, res) {
server.listen(process.env.PORT || 7000, async () => {
await connect();
console.log('Scraper started');
enableScheduling();
startScraper();
});

View File

@@ -18,8 +18,9 @@ module.exports.updateCurrentSeeders = function (torrent) {
const Tracker = require("peer-search/tracker");
const seeders = {};
const decodedMagnetLink = torrent.magnetLink && decode(torrent.magnetLink);
const trackers = decodedMagnetLink && decodedMagnetLink.tr || torrent.trackers || await getDefaultTrackers();
const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr;
const torrentTrackers = torrent.trackers && torrent.trackers.split(',');
const trackers = magnetTrackers || torrentTrackers || await getDefaultTrackers();
const callback = () => resolve(Math.max(...Object.values(seeders).map(values => values[0]).concat(0)));
setTimeout(callback, SEEDS_CHECK_TIMEOUT);
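
The split into `magnetTrackers` and `torrentTrackers` matters because the two sources have different shapes: `decode(...).tr` already yields a usable tracker list, while the `trackers` column stores a single comma-separated string, which the old `||` chain could pass through unsplit. A small illustration with made-up values:
```js
// Made-up values illustrating the two tracker sources reconciled above.
// Trackers parsed out of a magnet link are already a list of URLs:
const magnetTrackers = ['udp://tracker.example.org:1337/announce', 'udp://open.example.com:6969'];
// Trackers stored on the torrent row arrive as one comma-separated string:
const storedTrackers = 'udp://tracker.example.org:1337/announce,udp://open.example.com:6969';
// so the stored value has to be split before both can feed the same Tracker lookups:
console.log(storedTrackers.split(',')); // -> same shape as magnetTrackers
```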

View File

@@ -29,20 +29,20 @@ async function createTorrentEntry(torrent, overwrite = false) {
}
if (!torrent.imdbId && !torrent.kitsuId && !titleInfo.complete && typeof titleInfo.year !== 'string') {
console.log(`imdbId or kitsuId not found: ${torrent.title}`);
console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
return;
}
const files = await parseTorrentFiles(torrent)
.then(files => overwrite ? overwriteExistingFiles(torrent, files) : files);
if (!files || !files.length) {
console.log(`no video files found for [${torrent.infoHash}] ${torrent.title}`);
console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
return;
}
return repository.createTorrent(torrent)
.then(() => Promise.all(files.map(file => repository.createFile(file))))
.then(() => console.log(`Created entry for [${torrent.infoHash}] ${torrent.title}`));
.then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}
async function overwriteExistingFiles(torrent, files) {

View File

@@ -1,4 +1,3 @@
require('dotenv').config();
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const repository = require('../lib/repository');

View File

@@ -0,0 +1,13 @@
const { scheduleScraping, scrapeAll } = require('./scraper')
const { scheduleUpdateSeeders } = require('./seeders')
function startScraper() {
if (process.env.ENABLE_SCHEDULING) {
scheduleScraping();
scheduleUpdateSeeders();
} else {
scrapeAll()
}
}
module.exports = { startScraper }
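
`startScraper()` keeps the old env switch: with `ENABLE_SCHEDULING` set, every registered provider gets its own continuous loop plus the seeders loop; with it unset, the process does a single sequential pass and then just serves the health endpoint. A minimal sketch of driving the one-off path directly (path and setup are assumptions; index.js establishes the DB connection first):
```js
// One-off mode, roughly what startScraper() falls back to when ENABLE_SCHEDULING is unset.
// Assumes lib/repository.connect() has already run, as index.js does before calling it.
const { scrapeAll } = require('./scheduler/scraper');

scrapeAll()
    .then(() => console.log('one-off scrape of all providers finished'))
    .catch(error => console.error('one-off scrape failed: ', error));
```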

View File

@@ -0,0 +1,25 @@
const scrapers = require('./scrapers');
const { delay, sequence } = require('../lib/promises')
function scheduleScraping() {
return scrapers.forEach(provider => _continuousScrape(provider))
}
function scrapeAll() {
return sequence(scrapers.map(provider => () => _singleScrape(provider)))
}
async function _continuousScrape(provider) {
return _singleScrape(provider)
.then(() => delay(provider.scrapeInterval))
.then(() => _continuousScrape(provider))
}
async function _singleScrape(provider) {
return provider.scraper.scrape().catch(error => {
console.warn(`Failed ${provider.name} scraping due: `, error);
return Promise.resolve()
})
}
module.exports = { scheduleScraping, scrapeAll }
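
`_continuousScrape` re-arms itself with a plain promise delay instead of a cron expression, so each provider cycles on its own `scrapeInterval`. The two helpers it pulls in from `lib/promises` are not part of this diff; a minimal sketch of what they are assumed to look like, inferred from how `scheduleScraping()` and `scrapeAll()` use them (the real module may be more general):
```js
// Assumed shape of lib/promises (not included in this diff); inferred from the call sites above.

// delay(ms): resolve after the given number of milliseconds
function delay(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// sequence(fns): run promise-returning functions one after another, collecting their results
async function sequence(fns) {
  const results = [];
  for (const fn of fns) {
    results.push(await fn());
  }
  return results;
}

module.exports = { delay, sequence };
```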

View File

@@ -0,0 +1,22 @@
const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
const horribleSubsScraper = require('../scrapers/horriblesubs/horriblesubs_scraper');
const ytsScraper = require('../scrapers/yts/yts_scraper');
const eztvScraper = require('../scrapers/eztv/eztv_scraper');
const leetxScraper = require('../scrapers/1337x/1337x_scraper');
const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
module.exports = [
{ scraper: ytsScraper, name: ytsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: eztvScraper, name: eztvScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: rarbgScraper, name: rarbgScraper.NAME, scrapeInterval: 2 * 60 * 60 * 1000 },
{ scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: kickassScraper, name: kickassScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: leetxScraper, name: leetxScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
// { scraper: require('../scrapers/1337x/1337x_dump_scraper') }
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
];
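
Each registry entry pairs a provider module with the interval its loop should use. From how the scheduler consumes them, a provider is expected to export a `NAME`, a `scrape()` that resolves once a full pass is done, and an `updateSeeders(torrent, getImdbIdsMethod)` that resolves with the refreshed record (or an array of records). A hypothetical skeleton for adding a new provider (`ExampleProvider` is a placeholder, not part of this commit):
```js
// Hypothetical provider skeleton matching what scheduler/scraper.js and scheduler/seeders.js expect.
const NAME = 'ExampleProvider'; // placeholder name

async function scrape() {
  // fetch the latest torrents from the source and create entries via lib/torrentEntries
}

async function updateSeeders(torrent, getImdbIdsMethod) {
  // resolve with one refreshed torrent record, or an array of them;
  // the scheduler falls back to a direct tracker scrape if the torrent's infoHash is missing here
  return [];
}

module.exports = { NAME, scrape, updateSeeders };
```
A matching registry entry would then look like `{ scraper: exampleScraper, name: exampleScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 }`.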

View File

@@ -0,0 +1,41 @@
const Bottleneck = require('bottleneck');
const scrapers = require('./scrapers');
const repository = require('../lib/repository')
const { delay } = require('../lib/promises')
const { updateCurrentSeeders } = require('../lib/torrent')
const { updateTorrentSeeders } = require('../lib/torrentEntries')
const DELAY = 15 * 1000; // 15 seconds
const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });
const forceSeedersLimiter = new Bottleneck({ maxConcurrent: 5 });
function scheduleUpdateSeeders() {
console.log('Starting seeders update...')
return repository.getUpdateSeedersTorrents()
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => _updateSeeders(torrent)))))
.then(() => console.log('Finished seeders update'))
.then(() => delay(DELAY))
.then(() => scheduleUpdateSeeders());
}
async function _updateSeeders(torrent) {
const provider = await scrapers.find(provider => provider.name === torrent.provider);
const updatedTorrents = await provider.scraper.updateSeeders(torrent, getImdbIdsMethod(torrent))
.then(updated => Array.isArray(updated) ? updated : [updated])
.catch(() => []);
if (!updatedTorrents.find(updated => updated.infoHash === torrent.infoHash)) {
await forceSeedersLimiter.schedule(() => updateCurrentSeeders(torrent))
.then(updated => updatedTorrents.push(updated));
}
return Promise.all(updatedTorrents.map(updated => updateTorrentSeeders(updated)))
}
async function getImdbIdsMethod(torrent) {
return () => repository.getFiles(torrent)
.then(files => files.map(file => file.imdbId).filter(id => id))
.then(ids => Array.from(new Set(ids)));
}
module.exports = { scheduleUpdateSeeders }
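
Two Bottleneck limiters keep the loop polite: `limiter` throttles the provider lookups (at most 20 in flight, new ones starting no more often than every 250 ms), while `forceSeedersLimiter` caps the fallback tracker scrapes at 5 concurrent. A self-contained illustration of the scheduling semantics relied on here, using Bottleneck's documented behaviour:
```js
const Bottleneck = require('bottleneck');

// Same settings as above: at most 20 jobs in flight, starts spaced at least 250 ms apart.
const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });

// schedule(fn) queues fn and resolves with its result once the limiter lets it run,
// so Promise.all over the scheduled jobs resolves only after the whole batch has drained.
const jobs = [1, 2, 3].map(id => limiter.schedule(async () => `updated torrent ${id}`));
Promise.all(jobs).then(results => console.log(results));
```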

View File

@@ -27,10 +27,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => leetx.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => leetx.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -33,7 +33,7 @@ function search(imdbId, config = {}, retries = 2) {
return Promises.first(defaultProxies
.map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config)))
.then(results => parseResults(results))
.then(torrents => torrents.length === limit && page < maxPage
.then(torrents => torrents.length === limit && page < maxPage && !torrents.find(t => t.imdbId === imdbId)
? search(imdbId, { ...config, page: page + 1 })
.catch(() => [])
.then(nextTorrents => torrents.concat(nextTorrents))

View File

@@ -4,14 +4,12 @@ const eztv = require('./eztv_api');
const { Type } = require('../../lib/types');
const Promises = require('../../lib/promises');
const repository = require('../../lib/repository');
const { updateCurrentSeeders } = require('../../lib/torrent');
const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries');
const NAME = 'EZTV';
const UNTIL_PAGE = 10;
const limiter = new Bottleneck({ maxConcurrent: 20 });
const limiter = new Bottleneck({ maxConcurrent: 1 });
async function scrape() {
const scrapeStart = moment();
@@ -26,11 +24,9 @@ async function scrape() {
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
return limiter.schedule(() => eztv.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
async function updateSeeders(torrent, getImdbIdsMethod) {
return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
.map(imdbId => limiter.schedule(() => eztv.search(imdbId)))));
}
async function scrapeLatestTorrents() {
@@ -45,7 +41,7 @@ async function scrapeLatestTorrentsForCategory(page = 1) {
// return Promises.delay(30000).then(() => scrapeLatestTorrentsForCategory(page))
return Promise.resolve([]);
})
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
.then(torrents => Promise.all(torrents.map(torrent => processTorrentRecord(torrent))))
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
? scrapeLatestTorrentsForCategory(page + 1)
: Promise.resolve());

View File

@@ -37,8 +37,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return entryLimiter.schedule(() => updateCurrentSeeders(torrent)
.then(updated => updateTorrentSeeders(updated)));
return Promise.resolve([]);
}
async function _scrapeLatestEntries() {

View File

@@ -27,10 +27,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => kickass.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => kickass.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -26,15 +26,9 @@ async function scrape() {
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
const imdbIds = await repository.getFiles(torrent)
.then(files => files.map(file => file.imdbId))
.then(ids => Array.from(new Set(ids)));
return Promise.all(imdbIds.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb'))))
.then(results => results.reduce((a, b) => a.concat(b), []))
.then(results => results.map(result => toTorrent(result)))
.then(torrents => Promise.all(torrents.map(updated => updateTorrentSeeders(updated))));
async function updateSeeders(torrent, getImdbIdsMethod) {
return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb')))));
}
async function scrapeLatestTorrents() {

View File

@@ -39,10 +39,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -24,9 +24,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => yts.torrent(torrent.torrentId)
.then(records => records.map(record => ({ ...record, provider: NAME })))
.then(records => Promise.all(records.map(record => updateTorrentSeeders(record)))));
return limiter.schedule(() => yts.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {