Mirror of https://github.com/knightcrawler-stremio/knightcrawler.git, synced 2024-12-20 03:29:51 +00:00
[scraper] rework scraper scheduling and add seeders updating
package-lock.json (generated, 4 lines changed)
@@ -1676,8 +1676,8 @@
      }
    },
    "parse-torrent-title": {
-     "version": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
-     "from": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
+     "version": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
+     "from": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
      "requires": {
        "moment": "^2.24.0"
      }
@@ -17,7 +17,6 @@
    "cache-manager": "^2.9.0",
    "cache-manager-mongodb": "^0.2.1",
    "cheerio": "^0.22.0",
    "dotenv": "^8.2.0",
    "express": "^4.16.4",
    "google-it": "^1.5.0",
    "he": "^1.2.0",
@@ -31,7 +30,7 @@
    "node-schedule": "^1.3.2",
    "nodejs-bing": "^0.1.0",
    "parse-torrent": "^6.1.2",
-   "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
+   "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
    "peer-search": "^0.6.x",
    "pg": "^7.8.2",
    "pg-hstore": "^2.3.2",
@@ -20,7 +20,7 @@ Scrape movie and tv catalog using [www.webscraper.io](https://www.webscraper.io/

Movies sitemap
```json
-{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4110]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width]","multiple":true,"regex":"tt[0-9]+","delay":0}]}
+{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```

TV sitemap
@@ -1,65 +1,7 @@
require('dotenv').config();
const express = require("express");
const server = express();
-const schedule = require('node-schedule');
-const { connect, getUpdateSeedersTorrents } = require('./lib/repository');
-const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
-const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
-const ytsScraper = require('./scrapers/yts/yts_scraper');
-const eztvScraper = require('./scrapers/eztv/eztv_scraper');
-const leetxScraper = require('./scrapers/1337x/1337x_scraper');
-const kickassScraper = require('./scrapers/kickass/kickass_scraper');
-const rarbgScraper = require('./scrapers/rarbg/rarbg_scraper');
-const rarbgDumpScraper = require('./scrapers/rarbg/rarbg_dump_scraper');
-const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper');
-const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');

-const PROVIDERS = [
-  // require('./scrapers/thepiratebay/thepiratebay_update_size_scraper')
-  // require('./scrapers/1337x/1337x_dump_scraper')
-  ytsScraper,
-  eztvScraper,
-  horribleSubsScraper,
-  rarbgScraper,
-  thepiratebayScraper,
-  kickassScraper,
-  leetxScraper
-  // rarbgDumpScraper
-  // thepiratebayDumpScraper
-  // thepiratebayUnofficialDumpScraper
-];
-const SCRAPE_CRON = process.env.SCRAPE_CRON || '0 0 */4 ? * *'; // every 4 hours
-const SEEDERS_CRON = '0 */2 * ? * *'; // every 2 minutes

-async function scrape() {
-  return PROVIDERS
-      .reduce(async (previousPromise, nextProvider) => {
-        await previousPromise;
-        return nextProvider.scrape().catch(error => {
-          console.warn(`Failed ${nextProvider.NAME} scraping due: `, error);
-          return Promise.resolve()
-        });
-      }, Promise.resolve());
-}

-async function updateSeeders() {
-  return getUpdateSeedersTorrents()
-      .then(torrents => Promise.all(torrents
-          .map(torrent => PROVIDERS.find(provider => provider.NAME === torrent.provider)
-              .updateSeeders(torrent))))
-      .then(() => console.log('Finished updating seeders'));
-}

-function enableScheduling() {
-  if (process.env.ENABLE_SCHEDULING) {
-    schedule.scheduleJob(SCRAPE_CRON,
-        () => scrape().catch(error => console.error('Failed scraping: ', error)));
-    // schedule.scheduleJob(SEEDERS_CRON,
-    //     () => updateSeeders().catch(error => console.error('Failed update seeders: ', error)));
-  } else {
-    scrape().catch(error => console.error('Failed scraping: ', error));
-  }
-}
+const { connect } = require('./lib/repository');
+const { startScraper } = require('./scheduler/scheduler')

server.get('/', function (req, res) {
  res.sendStatus(200);
@@ -68,5 +10,5 @@ server.get('/', function (req, res) {
server.listen(process.env.PORT || 7000, async () => {
  await connect();
  console.log('Scraper started');
-  enableScheduling();
+  startScraper();
});
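Pieced together from the unchanged and added lines above, the slimmed-down index.js presumably ends up close to this (exact line order and blank lines assumed):

```js
require('dotenv').config();
const express = require("express");
const server = express();
const { connect } = require('./lib/repository');
const { startScraper } = require('./scheduler/scheduler')

server.get('/', function (req, res) {
  res.sendStatus(200);
});

server.listen(process.env.PORT || 7000, async () => {
  await connect();
  console.log('Scraper started');
  startScraper();
});
```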
@@ -18,8 +18,9 @@ module.exports.updateCurrentSeeders = function (torrent) {
  const Tracker = require("peer-search/tracker");

  const seeders = {};
-  const decodedMagnetLink = torrent.magnetLink && decode(torrent.magnetLink);
-  const trackers = decodedMagnetLink && decodedMagnetLink.tr || torrent.trackers || await getDefaultTrackers();
+  const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr;
+  const torrentTrackers = torrent.trackers && torrent.trackers.split(',');
+  const trackers = magnetTrackers || torrentTrackers || await getDefaultTrackers();
  const callback = () => resolve(Math.max(...Object.values(seeders).map(values => values[0]).concat(0)));
  setTimeout(callback, SEEDS_CHECK_TIMEOUT);
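The new tracker selection prefers trackers embedded in the magnet link, then the torrent's stored comma-separated tracker list, and only then the default trackers. A minimal sketch of that precedence as a standalone helper; the magnet-uri import and the resolveTrackers/getDefaultTrackers names are assumptions for illustration:

```js
// Illustrative helper mirroring the precedence introduced above (names assumed).
// magnet-uri's decode() returns an object whose `tr` field holds the magnet's trackers.
const { decode } = require('magnet-uri');

async function resolveTrackers(torrent, getDefaultTrackers) {
  const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr;
  const torrentTrackers = torrent.trackers && torrent.trackers.split(',');
  return magnetTrackers || torrentTrackers || getDefaultTrackers();
}
```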
@@ -29,20 +29,20 @@ async function createTorrentEntry(torrent, overwrite = false) {
  }

  if (!torrent.imdbId && !torrent.kitsuId && !titleInfo.complete && typeof titleInfo.year !== 'string') {
-    console.log(`imdbId or kitsuId not found: ${torrent.title}`);
+    console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
    return;
  }

  const files = await parseTorrentFiles(torrent)
      .then(files => overwrite ? overwriteExistingFiles(torrent, files) : files);
  if (!files || !files.length) {
-    console.log(`no video files found for [${torrent.infoHash}] ${torrent.title}`);
+    console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
    return;
  }

  return repository.createTorrent(torrent)
      .then(() => Promise.all(files.map(file => repository.createFile(file))))
-      .then(() => console.log(`Created entry for [${torrent.infoHash}] ${torrent.title}`));
+      .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}

async function overwriteExistingFiles(torrent, files) {
@@ -1,4 +1,3 @@
require('dotenv').config();
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const repository = require('../lib/repository');
scraper/scheduler/scheduler.js (new file, 13 lines)
@@ -0,0 +1,13 @@
const { scheduleScraping, scrapeAll } = require('./scraper')
const { scheduleUpdateSeeders } = require('./seeders')

function startScraper() {
  if (process.env.ENABLE_SCHEDULING) {
    scheduleScraping();
    scheduleUpdateSeeders();
  } else {
    scrapeAll()
  }
}

module.exports = { startScraper }
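Note that startScraper() only checks whether ENABLE_SCHEDULING is set at all: environment variables are strings, so even ENABLE_SCHEDULING=false would enable scheduling. A stricter toggle, if that were ever wanted (sketch, not part of this commit):

```js
// Hypothetical stricter toggle; process.env values are always strings.
const schedulingEnabled = /^(1|true|yes)$/i.test(process.env.ENABLE_SCHEDULING || '');
```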
scraper/scheduler/scraper.js (new file, 25 lines)
@@ -0,0 +1,25 @@
const scrapers = require('./scrapers');
const { delay, sequence } = require('../lib/promises')

function scheduleScraping() {
  return scrapers.forEach(provider => _continuousScrape(provider))
}

function scrapeAll() {
  return sequence(scrapers.map(provider => () => _singleScrape(provider)))
}

async function _continuousScrape(provider) {
  return _singleScrape(provider)
      .then(() => delay(provider.scrapeInterval))
      .then(() => _continuousScrape(provider))
}

async function _singleScrape(provider) {
  return provider.scraper.scrape().catch(error => {
    console.warn(`Failed ${provider.name} scraping due: `, error);
    return Promise.resolve()
  })
}

module.exports = { scheduleScraping, scrapeAll }
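scraper.js leans on two promise helpers, delay and sequence, from ../lib/promises, which are not part of this diff. A minimal sketch of what they plausibly look like, inferred only from how they are called above:

```js
// Sketch of the assumed ../lib/promises helpers (not included in this commit).
function delay(ms) {
  // resolve after the given number of milliseconds
  return new Promise(resolve => setTimeout(resolve, ms));
}

function sequence(promiseFactories) {
  // run the factories strictly one after another, collecting results in order
  return promiseFactories.reduce(
      (chain, factory) => chain.then(results => factory().then(result => results.concat(result))),
      Promise.resolve([]));
}

module.exports = { delay, sequence };
```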
scraper/scheduler/scrapers.js (new file, 22 lines)
@@ -0,0 +1,22 @@
const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
const horribleSubsScraper = require('../scrapers/horriblesubs/horriblesubs_scraper');
const ytsScraper = require('../scrapers/yts/yts_scraper');
const eztvScraper = require('../scrapers/eztv/eztv_scraper');
const leetxScraper = require('../scrapers/1337x/1337x_scraper');
const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');

module.exports = [
  { scraper: ytsScraper, name: ytsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  { scraper: eztvScraper, name: eztvScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  { scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  { scraper: rarbgScraper, name: rarbgScraper.NAME, scrapeInterval: 2 * 60 * 60 * 1000 },
  { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  { scraper: kickassScraper, name: kickassScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  { scraper: leetxScraper, name: leetxScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
  // { scraper: require('../scrapers/1337x/1337x_dump_scraper') }
  // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
];
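Each entry wraps a scraper module together with its display name and per-provider scrape interval (2 h for RARBG, 4 h for the rest). Judging from how scheduler/scraper.js and scheduler/seeders.js consume these entries, a provider module needs to expose roughly this surface; the skeleton below is hypothetical and not part of the commit:

```js
// Hypothetical provider skeleton compatible with the new scheduler.
const NAME = 'ExampleProvider';

async function scrape() {
  // fetch the latest torrents and create entries for them
}

async function updateSeeders(torrent, getImdbIdsMethod) {
  // return the refreshed torrent record, or an array of records;
  // persistence is handled by scheduler/seeders.js
  return [];
}

module.exports = { NAME, scrape, updateSeeders };
```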
scraper/scheduler/seeders.js (new file, 41 lines)
@@ -0,0 +1,41 @@
const Bottleneck = require('bottleneck');
const scrapers = require('./scrapers');
const repository = require('../lib/repository')
const { delay } = require('../lib/promises')
const { updateCurrentSeeders } = require('../lib/torrent')
const { updateTorrentSeeders } = require('../lib/torrentEntries')

const DELAY = 15 * 1000; // 15 seconds
const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });
const forceSeedersLimiter = new Bottleneck({ maxConcurrent: 5 });

function scheduleUpdateSeeders() {
  console.log('Starting seeders update...')
  return repository.getUpdateSeedersTorrents()
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => _updateSeeders(torrent)))))
      .then(() => console.log('Finished seeders update'))
      .then(() => delay(DELAY))
      .then(() => scheduleUpdateSeeders());
}

async function _updateSeeders(torrent) {
  const provider = await scrapers.find(provider => provider.name === torrent.provider);
  const updatedTorrents = await provider.scraper.updateSeeders(torrent, getImdbIdsMethod(torrent))
      .then(updated => Array.isArray(updated) ? updated : [updated])
      .catch(() => []);

  if (!updatedTorrents.find(updated => updated.infoHash === torrent.infoHash)) {
    await forceSeedersLimiter.schedule(() => updateCurrentSeeders(torrent))
        .then(updated => updatedTorrents.push(updated));
  }

  return Promise.all(updatedTorrents.map(updated => updateTorrentSeeders(updated)))
}

async function getImdbIdsMethod(torrent) {
  return () => repository.getFiles(torrent)
      .then(files => files.map(file => file.imdbId).filter(id => id))
      .then(ids => Array.from(new Set(ids)));
}

module.exports = { scheduleUpdateSeeders }
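The limiter above caps seeder refreshes at 20 concurrent jobs with at least 250 ms between job starts (roughly four per second), while a second, stricter limiter guards the tracker-scrape fallback. A small standalone illustration of how Bottleneck's schedule() queues work, not taken from the commit:

```js
// Standalone illustration of Bottleneck's schedule(): at most 2 jobs run at once here.
const Bottleneck = require('bottleneck');

const limiter = new Bottleneck({ maxConcurrent: 2, minTime: 100 });

async function demo() {
  const jobs = [1, 2, 3, 4].map(id => limiter.schedule(() => Promise.resolve(id * 10)));
  console.log(await Promise.all(jobs)); // [10, 20, 30, 40]
}

demo();
```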
@@ -27,10 +27,7 @@ async function scrape() {
}

async function updateSeeders(torrent) {
-  return limiter.schedule(() => leetx.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => leetx.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
@@ -33,7 +33,7 @@ function search(imdbId, config = {}, retries = 2) {
  return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config)))
      .then(results => parseResults(results))
-      .then(torrents => torrents.length === limit && page < maxPage
+      .then(torrents => torrents.length === limit && page < maxPage && !torrents.find(t => t.imdbId === imdbId)
          ? search(imdbId, { ...config, page: page + 1 })
              .catch(() => [])
              .then(nextTorrents => torrents.concat(nextTorrents))
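The search above fans one request out to every proxy through Promises.first, which is not shown in this diff; presumably it resolves with the first request that succeeds and only rejects once all of them fail. A sketch under that assumption:

```js
// Assumed behaviour of Promises.first: resolve with the first fulfilled promise,
// reject only once every promise has rejected.
function first(promises) {
  return new Promise((resolve, reject) => {
    let failed = 0;
    promises.forEach(promise =>
        Promise.resolve(promise).then(resolve, () => {
          failed += 1;
          if (failed === promises.length) {
            reject(new Error('all promises rejected'));
          }
        }));
  });
}
```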
@@ -4,14 +4,12 @@ const eztv = require('./eztv_api');
const { Type } = require('../../lib/types');
const Promises = require('../../lib/promises');
-const repository = require('../../lib/repository');

-const { updateCurrentSeeders } = require('../../lib/torrent');
const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries');

const NAME = 'EZTV';
const UNTIL_PAGE = 10;

-const limiter = new Bottleneck({ maxConcurrent: 20 });
+const limiter = new Bottleneck({ maxConcurrent: 1 });

async function scrape() {
  const scrapeStart = moment();
@@ -26,11 +24,9 @@ async function scrape() {
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

-async function updateSeeders(torrent) {
-  return limiter.schedule(() => eztv.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+async function updateSeeders(torrent, getImdbIdsMethod) {
+  return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
+      .map(imdbId => limiter.schedule(() => eztv.search(imdbId)))));
}

async function scrapeLatestTorrents() {
@@ -45,7 +41,7 @@ async function scrapeLatestTorrentsForCategory(page = 1) {
        // return Promises.delay(30000).then(() => scrapeLatestTorrentsForCategory(page))
        return Promise.resolve([]);
      })
-      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
+      .then(torrents => Promise.all(torrents.map(torrent => processTorrentRecord(torrent))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(page + 1)
          : Promise.resolve());
@@ -37,8 +37,7 @@ async function scrape() {
}

async function updateSeeders(torrent) {
-  return entryLimiter.schedule(() => updateCurrentSeeders(torrent)
-      .then(updated => updateTorrentSeeders(updated)));
+  return Promise.resolve([]);
}

async function _scrapeLatestEntries() {
@@ -27,10 +27,7 @@ async function scrape() {
}

async function updateSeeders(torrent) {
-  return limiter.schedule(() => kickass.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => kickass.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
@@ -26,15 +26,9 @@
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

-async function updateSeeders(torrent) {
-  const imdbIds = await repository.getFiles(torrent)
-      .then(files => files.map(file => file.imdbId))
-      .then(ids => Array.from(new Set(ids)));
-
-  return Promise.all(imdbIds.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb'))))
-      .then(results => results.reduce((a, b) => a.concat(b), []))
-      .then(results => results.map(result => toTorrent(result)))
-      .then(torrents => Promise.all(torrents.map(updated => updateTorrentSeeders(updated))));
+async function updateSeeders(torrent, getImdbIdsMethod) {
+  return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
+      .map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb')))));
}

async function scrapeLatestTorrents() {
@@ -39,10 +39,7 @@ async function scrape() {
}

async function updateSeeders(torrent) {
-  return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId)
-      .then(record => (torrent.seeders = record.seeders, torrent))
-      .catch(() => updateCurrentSeeders(torrent))
-      .then(updated => updateTorrentSeeders(updated)));
+  return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
@@ -24,9 +24,7 @@ async function scrape() {
}

async function updateSeeders(torrent) {
-  return limiter.schedule(() => yts.torrent(torrent.torrentId)
-      .then(records => records.map(record => ({ ...record, provider: NAME })))
-      .then(records => Promise.all(records.map(record => updateTorrentSeeders(record)))));
+  return limiter.schedule(() => yts.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {