[scraper] rework scraper scheduling and add seeders updating

TheBeastLT
2020-04-23 16:33:08 +02:00
parent d01e2c9b35
commit 9ea3932af1
19 changed files with 128 additions and 108 deletions

package-lock.json generated
View File

@@ -1676,8 +1676,8 @@
}
},
"parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"requires": {
"moment": "^2.24.0"
}

View File

@@ -17,7 +17,6 @@
"cache-manager": "^2.9.0",
"cache-manager-mongodb": "^0.2.1",
"cheerio": "^0.22.0",
"dotenv": "^8.2.0",
"express": "^4.16.4",
"google-it": "^1.5.0",
"he": "^1.2.0",
@@ -31,7 +30,7 @@
"node-schedule": "^1.3.2",
"nodejs-bing": "^0.1.0",
"parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#299b6818907542086ce5d88f11f9359efa145928",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#9e751ef6e0b4adb31d306d618c92f9feb84963bb",
"peer-search": "^0.6.x",
"pg": "^7.8.2",
"pg-hstore": "^2.3.2",

View File

@@ -20,7 +20,7 @@ Scrape movie and tv catalog using [www.webscraper.io](https://www.webscraper.io/
Movies sitemap
```json
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4110]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width]","multiple":true,"regex":"tt[0-9]+","delay":0}]}
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```
TV sitemap

View File

@@ -1,65 +1,7 @@
require('dotenv').config();
const express = require("express");
const server = express();
const schedule = require('node-schedule');
const { connect, getUpdateSeedersTorrents } = require('./lib/repository');
const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
const ytsScraper = require('./scrapers/yts/yts_scraper');
const eztvScraper = require('./scrapers/eztv/eztv_scraper');
const leetxScraper = require('./scrapers/1337x/1337x_scraper');
const kickassScraper = require('./scrapers/kickass/kickass_scraper');
const rarbgScraper = require('./scrapers/rarbg/rarbg_scraper');
const rarbgDumpScraper = require('./scrapers/rarbg/rarbg_dump_scraper');
const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper');
const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
const PROVIDERS = [
// require('./scrapers/thepiratebay/thepiratebay_update_size_scraper')
// require('./scrapers/1337x/1337x_dump_scraper')
ytsScraper,
eztvScraper,
horribleSubsScraper,
rarbgScraper,
thepiratebayScraper,
kickassScraper,
leetxScraper
// rarbgDumpScraper
// thepiratebayDumpScraper
// thepiratebayUnofficialDumpScraper
];
const SCRAPE_CRON = process.env.SCRAPE_CRON || '0 0 */4 ? * *'; // every 4 hours
const SEEDERS_CRON = '0 */2 * ? * *'; // every 2 minutes
async function scrape() {
return PROVIDERS
.reduce(async (previousPromise, nextProvider) => {
await previousPromise;
return nextProvider.scrape().catch(error => {
console.warn(`Failed ${nextProvider.NAME} scraping due: `, error);
return Promise.resolve()
});
}, Promise.resolve());
}
async function updateSeeders() {
return getUpdateSeedersTorrents()
.then(torrents => Promise.all(torrents
.map(torrent => PROVIDERS.find(provider => provider.NAME === torrent.provider)
.updateSeeders(torrent))))
.then(() => console.log('Finished updating seeders'));
}
function enableScheduling() {
if (process.env.ENABLE_SCHEDULING) {
schedule.scheduleJob(SCRAPE_CRON,
() => scrape().catch(error => console.error('Failed scraping: ', error)));
// schedule.scheduleJob(SEEDERS_CRON,
// () => updateSeeders().catch(error => console.error('Failed update seeders: ', error)));
} else {
scrape().catch(error => console.error('Failed scraping: ', error));
}
}
const { connect } = require('./lib/repository');
const { startScraper } = require('./scheduler/scheduler')
server.get('/', function (req, res) {
res.sendStatus(200);
@@ -68,5 +10,5 @@ server.get('/', function (req, res) {
server.listen(process.env.PORT || 7000, async () => {
await connect();
console.log('Scraper started');
enableScheduling();
startScraper();
});

View File

@@ -18,8 +18,9 @@ module.exports.updateCurrentSeeders = function (torrent) {
const Tracker = require("peer-search/tracker");
const seeders = {};
const decodedMagnetLink = torrent.magnetLink && decode(torrent.magnetLink);
const trackers = decodedMagnetLink && decodedMagnetLink.tr || torrent.trackers || await getDefaultTrackers();
const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr;
const torrentTrackers = torrent.trackers && torrent.trackers.split(',');
const trackers = magnetTrackers || torrentTrackers || await getDefaultTrackers();
const callback = () => resolve(Math.max(...Object.values(seeders).map(values => values[0]).concat(0)));
setTimeout(callback, SEEDS_CHECK_TIMEOUT);
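
The split into `magnetTrackers` and `torrentTrackers` matters because the two sources have different shapes: `decode(...).tr` already yields a usable tracker list, while the `trackers` column stores a single comma-separated string, which the old `||` chain could pass through unsplit. A small illustration with made-up values:
```js
// Made-up values illustrating the two tracker sources reconciled above.
// Trackers parsed out of a magnet link are already a list of URLs:
const magnetTrackers = ['udp://tracker.example.org:1337/announce', 'udp://open.example.com:6969'];
// Trackers stored on the torrent row arrive as one comma-separated string:
const storedTrackers = 'udp://tracker.example.org:1337/announce,udp://open.example.com:6969';
// so the stored value has to be split before both can feed the same Tracker lookups:
console.log(storedTrackers.split(',')); // -> same shape as magnetTrackers
```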

View File

@@ -29,20 +29,20 @@ async function createTorrentEntry(torrent, overwrite = false) {
}
if (!torrent.imdbId && !torrent.kitsuId && !titleInfo.complete && typeof titleInfo.year !== 'string') {
console.log(`imdbId or kitsuId not found: ${torrent.title}`);
console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
return;
}
const files = await parseTorrentFiles(torrent)
.then(files => overwrite ? overwriteExistingFiles(torrent, files) : files);
if (!files || !files.length) {
console.log(`no video files found for [${torrent.infoHash}] ${torrent.title}`);
console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
return;
}
return repository.createTorrent(torrent)
.then(() => Promise.all(files.map(file => repository.createFile(file))))
.then(() => console.log(`Created entry for [${torrent.infoHash}] ${torrent.title}`));
.then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}
async function overwriteExistingFiles(torrent, files) {

View File

@@ -1,4 +1,3 @@
require('dotenv').config();
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const repository = require('../lib/repository');

View File

@@ -0,0 +1,13 @@
const { scheduleScraping, scrapeAll } = require('./scraper')
const { scheduleUpdateSeeders } = require('./seeders')
function startScraper() {
if (process.env.ENABLE_SCHEDULING) {
scheduleScraping();
scheduleUpdateSeeders();
} else {
scrapeAll()
}
}
module.exports = { startScraper }
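
`startScraper()` keeps the old env switch: with `ENABLE_SCHEDULING` set, every registered provider gets its own continuous loop plus the seeders loop; with it unset, the process does a single sequential pass and then just serves the health endpoint. A minimal sketch of driving the one-off path directly (path and setup are assumptions; index.js establishes the DB connection first):
```js
// One-off mode, roughly what startScraper() falls back to when ENABLE_SCHEDULING is unset.
// Assumes lib/repository.connect() has already run, as index.js does before calling it.
const { scrapeAll } = require('./scheduler/scraper');

scrapeAll()
    .then(() => console.log('one-off scrape of all providers finished'))
    .catch(error => console.error('one-off scrape failed: ', error));
```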

View File

@@ -0,0 +1,25 @@
const scrapers = require('./scrapers');
const { delay, sequence } = require('../lib/promises')
function scheduleScraping() {
return scrapers.forEach(provider => _continuousScrape(provider))
}
function scrapeAll() {
return sequence(scrapers.map(provider => () => _singleScrape(provider)))
}
async function _continuousScrape(provider) {
return _singleScrape(provider)
.then(() => delay(provider.scrapeInterval))
.then(() => _continuousScrape(provider))
}
async function _singleScrape(provider) {
return provider.scraper.scrape().catch(error => {
console.warn(`Failed ${provider.name} scraping due: `, error);
return Promise.resolve()
})
}
module.exports = { scheduleScraping, scrapeAll }
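
`_continuousScrape` re-arms itself with a plain promise delay instead of a cron expression, so each provider cycles on its own `scrapeInterval`. The two helpers it pulls in from `lib/promises` are not part of this diff; a minimal sketch of what they are assumed to look like, inferred from how `scheduleScraping()` and `scrapeAll()` use them (the real module may be more general):
```js
// Assumed shape of lib/promises (not included in this diff); inferred from the call sites above.

// delay(ms): resolve after the given number of milliseconds
function delay(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// sequence(fns): run promise-returning functions one after another, collecting their results
async function sequence(fns) {
  const results = [];
  for (const fn of fns) {
    results.push(await fn());
  }
  return results;
}

module.exports = { delay, sequence };
```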

View File

@@ -0,0 +1,22 @@
const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
const horribleSubsScraper = require('../scrapers/horriblesubs/horriblesubs_scraper');
const ytsScraper = require('../scrapers/yts/yts_scraper');
const eztvScraper = require('../scrapers/eztv/eztv_scraper');
const leetxScraper = require('../scrapers/1337x/1337x_scraper');
const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
module.exports = [
{ scraper: ytsScraper, name: ytsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: eztvScraper, name: eztvScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: rarbgScraper, name: rarbgScraper.NAME, scrapeInterval: 2 * 60 * 60 * 1000 },
{ scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: kickassScraper, name: kickassScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
{ scraper: leetxScraper, name: leetxScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 },
// { scraper: require('../scrapers/1337x/1337x_dump_scraper') }
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
];
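
Each registry entry pairs a provider module with the interval its loop should use. From how the scheduler consumes them, a provider is expected to export a `NAME`, a `scrape()` that resolves once a full pass is done, and an `updateSeeders(torrent, getImdbIdsMethod)` that resolves with the refreshed record (or an array of records). A hypothetical skeleton for adding a new provider (`ExampleProvider` is a placeholder, not part of this commit):
```js
// Hypothetical provider skeleton matching what scheduler/scraper.js and scheduler/seeders.js expect.
const NAME = 'ExampleProvider'; // placeholder name

async function scrape() {
  // fetch the latest torrents from the source and create entries via lib/torrentEntries
}

async function updateSeeders(torrent, getImdbIdsMethod) {
  // resolve with one refreshed torrent record, or an array of them;
  // the scheduler falls back to a direct tracker scrape if the torrent's infoHash is missing here
  return [];
}

module.exports = { NAME, scrape, updateSeeders };
```
A matching registry entry would then look like `{ scraper: exampleScraper, name: exampleScraper.NAME, scrapeInterval: 4 * 60 * 60 * 1000 }`.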

View File

@@ -0,0 +1,41 @@
const Bottleneck = require('bottleneck');
const scrapers = require('./scrapers');
const repository = require('../lib/repository')
const { delay } = require('../lib/promises')
const { updateCurrentSeeders } = require('../lib/torrent')
const { updateTorrentSeeders } = require('../lib/torrentEntries')
const DELAY = 15 * 1000; // 15 seconds
const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });
const forceSeedersLimiter = new Bottleneck({ maxConcurrent: 5 });
function scheduleUpdateSeeders() {
console.log('Starting seeders update...')
return repository.getUpdateSeedersTorrents()
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => _updateSeeders(torrent)))))
.then(() => console.log('Finished seeders update'))
.then(() => delay(DELAY))
.then(() => scheduleUpdateSeeders());
}
async function _updateSeeders(torrent) {
const provider = await scrapers.find(provider => provider.name === torrent.provider);
const updatedTorrents = await provider.scraper.updateSeeders(torrent, getImdbIdsMethod(torrent))
.then(updated => Array.isArray(updated) ? updated : [updated])
.catch(() => []);
if (!updatedTorrents.find(updated => updated.infoHash === torrent.infoHash)) {
await forceSeedersLimiter.schedule(() => updateCurrentSeeders(torrent))
.then(updated => updatedTorrents.push(updated));
}
return Promise.all(updatedTorrents.map(updated => updateTorrentSeeders(updated)))
}
async function getImdbIdsMethod(torrent) {
return () => repository.getFiles(torrent)
.then(files => files.map(file => file.imdbId).filter(id => id))
.then(ids => Array.from(new Set(ids)));
}
module.exports = { scheduleUpdateSeeders }
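
Two Bottleneck limiters keep the loop polite: `limiter` throttles the provider lookups (at most 20 in flight, new ones starting no more often than every 250 ms), while `forceSeedersLimiter` caps the fallback tracker scrapes at 5 concurrent. A self-contained illustration of the scheduling semantics relied on here, using Bottleneck's documented behaviour:
```js
const Bottleneck = require('bottleneck');

// Same settings as above: at most 20 jobs in flight, starts spaced at least 250 ms apart.
const limiter = new Bottleneck({ maxConcurrent: 20, minTime: 250 });

// schedule(fn) queues fn and resolves with its result once the limiter lets it run,
// so Promise.all over the scheduled jobs resolves only after the whole batch has drained.
const jobs = [1, 2, 3].map(id => limiter.schedule(async () => `updated torrent ${id}`));
Promise.all(jobs).then(results => console.log(results));
```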

View File

@@ -27,10 +27,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => leetx.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => leetx.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -33,7 +33,7 @@ function search(imdbId, config = {}, retries = 2) {
return Promises.first(defaultProxies
.map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config)))
.then(results => parseResults(results))
.then(torrents => torrents.length === limit && page < maxPage
.then(torrents => torrents.length === limit && page < maxPage && !torrents.find(t => t.imdbId === imdbId)
? search(imdbId, { ...config, page: page + 1 })
.catch(() => [])
.then(nextTorrents => torrents.concat(nextTorrents))

View File

@@ -4,14 +4,12 @@ const eztv = require('./eztv_api');
const { Type } = require('../../lib/types');
const Promises = require('../../lib/promises');
const repository = require('../../lib/repository');
const { updateCurrentSeeders } = require('../../lib/torrent');
const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries');
const NAME = 'EZTV';
const UNTIL_PAGE = 10;
const limiter = new Bottleneck({ maxConcurrent: 20 });
const limiter = new Bottleneck({ maxConcurrent: 1 });
async function scrape() {
const scrapeStart = moment();
@@ -26,11 +24,9 @@ async function scrape() {
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
return limiter.schedule(() => eztv.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
async function updateSeeders(torrent, getImdbIdsMethod) {
return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
.map(imdbId => limiter.schedule(() => eztv.search(imdbId)))));
}
async function scrapeLatestTorrents() {
@@ -45,7 +41,7 @@ async function scrapeLatestTorrentsForCategory(page = 1) {
// return Promises.delay(30000).then(() => scrapeLatestTorrentsForCategory(page))
return Promise.resolve([]);
})
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
.then(torrents => Promise.all(torrents.map(torrent => processTorrentRecord(torrent))))
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
? scrapeLatestTorrentsForCategory(page + 1)
: Promise.resolve());

View File

@@ -37,8 +37,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return entryLimiter.schedule(() => updateCurrentSeeders(torrent)
.then(updated => updateTorrentSeeders(updated)));
return Promise.resolve([]);
}
async function _scrapeLatestEntries() {

View File

@@ -27,10 +27,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => kickass.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => kickass.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -26,15 +26,9 @@ async function scrape() {
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
const imdbIds = await repository.getFiles(torrent)
.then(files => files.map(file => file.imdbId))
.then(ids => Array.from(new Set(ids)));
return Promise.all(imdbIds.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb'))))
.then(results => results.reduce((a, b) => a.concat(b), []))
.then(results => results.map(result => toTorrent(result)))
.then(torrents => Promise.all(torrents.map(updated => updateTorrentSeeders(updated))));
async function updateSeeders(torrent, getImdbIdsMethod) {
return getImdbIdsMethod().then(imdbIds => Promises.sequence(imdbIds
.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb')))));
}
async function scrapeLatestTorrents() {

View File

@@ -39,10 +39,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId)
.then(record => (torrent.seeders = record.seeders, torrent))
.catch(() => updateCurrentSeeders(torrent))
.then(updated => updateTorrentSeeders(updated)));
return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {

View File

@@ -24,9 +24,7 @@ async function scrape() {
}
async function updateSeeders(torrent) {
return limiter.schedule(() => yts.torrent(torrent.torrentId)
.then(records => records.map(record => ({ ...record, provider: NAME })))
.then(records => Promise.all(records.map(record => updateTorrentSeeders(record)))));
return limiter.schedule(() => yts.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {