[scraper] add rutor scraper

2021-02-15 12:38:43 +01:00
parent 0dac049107
commit e20ed50f17
8 changed files with 337 additions and 10 deletions
--- a/package-lock.json
+++ b/package-lock.json
@@ -1870,8 +1870,8 @@
      }
    },
    "parse-torrent-title": {
-      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
-      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
+      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
+      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
      "requires": {
        "moment": "^2.24.0"
      }
--- a/package.json
+++ b/package.json
@@ -32,7 +32,7 @@
    "nodejs-bing": "^0.1.0",
    "nyaapi": "^2.3.3",
    "parse-torrent": "^6.1.2",
-    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
+    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
    "pg": "^7.8.2",
    "pg-hstore": "^2.3.2",
    "real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#935a5c23ae809edbcd2a111526a7f74d6767c50d",
--- a/scraper/lib/metadata.js
+++ b/scraper/lib/metadata.js
@@ -84,7 +84,7 @@ function escapeTitle(title) {
      .replace(/[\u0300-\u036F]/g, '')
      .replace(/&/g, 'and')
      .replace(/[;, ~./]+/g, ' ') // replace dots, commas or underscores with spaces
-      .replace(/[^\w \-()+#@!']+/g, '') // remove all non-alphanumeric chars
+      .replace(/[^\w \-()+#@!'\u0400-\u04ff]+/g, '') // remove all non-alphanumeric chars
      .replace(/\s{2,}/, ' ') // replace multiple spaces
      .trim();
 }
@@ -102,7 +102,7 @@ async function getImdbId(info, type) {

  return cacheWrapImdbId(key,
      () => new Promise((resolve, reject) => {
-        nameToImdb({ name, year: info.year, type }, function (err, res) {
+        nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
          if (res) {
            resolve(res);
          } else {
--- a/scraper/lib/torrent.js
+++ b/scraper/lib/torrent.js
@@ -95,9 +95,12 @@ async function torrentFiles(torrent, timeout) {
 }

 function getFilesFromObject(torrent) {
+  // Resolves the file list carried by the torrent object itself, without any network call here.
+  // An array (even an empty one) is returned as-is.
-  if (torrent.files && torrent.files.length) {
+  if (Array.isArray(torrent.files)) {
    return Promise.resolve(torrent.files);
  }
+  // A function is invoked and its result returned to the caller.
+  // NOTE(review): presumably it returns a promise of the file list - confirm against providers.
+  if (typeof torrent.files === 'function') {
+    return torrent.files();
+  }
  return Promise.reject("No files in the object");
 }

--- a/scraper/lib/torrentFiles.js
+++ b/scraper/lib/torrentFiles.js
@@ -1,4 +1,5 @@
 const moment = require('moment');
+const Bottleneck = require('bottleneck');
 const distance = require('jaro-winkler');
 const { parse } = require('parse-torrent-title');
 const Promises = require('../lib/promises');
@@ -9,6 +10,7 @@ const { Type } = require('./types');
 const { isDisk } = require('./extension');

 const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
+const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });

 async function parseTorrentFiles(torrent) {
  const parsedTorrentName = parse(torrent.title);
@@ -391,7 +393,7 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
  } else if (metadata.videos.some(video => video.kitsuEpisode)) {
    // imdb episode info is base
    files
-        .filter(file => file.season && file.episodes)
+        .filter(file => Number.isInteger(file.season) && file.episodes)
        .forEach(file => {
          if (seriesMapping[file.season]) {
            const seasonMapping = seriesMapping[file.season];
@@ -460,7 +462,7 @@ async function updateToCinemetaMetadata(metadata) {

 function findMovieImdbId(title) {
+  // Accepts either a raw title string (parsed here) or an already parsed title object.
  const parsedTitle = typeof title === 'string' ? parse(title) : title;
+  // The lookup is queued on imdb_limiter (maxConcurrent: 1, minTime: 1000);
+  // a failed lookup resolves to undefined instead of rejecting.
-  return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined);
+  return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
 }

 function findMovieKitsuId(title) {
--- a/scraper/scheduler/scrapers.js
+++ b/scraper/scheduler/scrapers.js
@@ -6,7 +6,8 @@ const kickassScraper = require('../scrapers/kickass/kickass_scraper');
 const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
 const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
 const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
-const torrentGalaxySiScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
+const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
+const rutorScraper = require('../scrapers/rutor/rutor_scraper');

 module.exports = [
  { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
@@ -14,9 +15,10 @@ module.exports = [
  { scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
+  { scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
+  { scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
-  { scraper: torrentGalaxySiScraper, name: torrentGalaxySiScraper.NAME, cron: '0 0 */4 ? * *' },
  // { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
  // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
  // { scraper: require('../scrapers/1337x/1337x_search_scraper') }
--- a/scraper/scrapers/rutor/rutor_api.js
+++ b/scraper/scrapers/rutor/rutor_api.js
@@ -0,0 +1,195 @@
+const cheerio = require('cheerio');
+const needle = require('needle');
+const moment = require('moment');
+const decode = require('magnet-uri');
+const { defaultOptionsWithProxy } = require('../../lib/requestHelper');
+
+// Root of the Rutor site; every request path in this module is appended to it.
+const baseUrl = 'http://www.rutor.info';
+// Handed to needle as `open_timeout` in singleRequest.
+const defaultTimeout = 10000;
+
+// Category identifiers placed into the browse URL by browse().
+// NOTE(review): the values are presumably Rutor's own category ids - confirm against the site.
+const Categories = {
+  FOREIGN_FILMS: '1',
+  RUSSIAN_FILMS: '5',
+  SCIENCE_FILMS: '12',
+  FOREIGN_SERIES: '4',
+  RUSSIAN_SERIES: '16',
+  RUSSIAN_TV: '6',
+  RUSSIAN_ANIMATION: '7',
+  ANIME: '10',
+  FOREIGN_RELEASES: '17'
+};
+
+/**
+ * Loads and parses the details page of a single torrent, retrying on failure.
+ *
+ * @param {string} torrentId - Rutor torrent id; a falsy id rejects immediately.
+ * @param {Object} [config] - not read here, only forwarded to the retry call.
+ * @param {number} [retries] - attempts left; 0 rejects with the last error.
+ * @param {Error} [error] - failure of the previous attempt, used as the rejection reason.
+ * @returns {Promise<Object>} the record produced by parseTorrentPage.
+ */
+function torrent(torrentId, config = {}, retries = 2, error = null) {
+  const canAttempt = Boolean(torrentId) && retries !== 0;
+  if (!canAttempt) {
+    const failure = error || new Error(`Failed ${torrentId} search`);
+    return Promise.reject(failure);
+  }
+
+  const pageUrl = `${baseUrl}/torrent/${torrentId}`;
+  // both request and parse failures lead to another attempt with one retry less
+  const retryWith = (cause) => torrent(torrentId, config, retries - 1, cause);
+
+  return singleRequest(pageUrl)
+      .then((html) => parseTorrentPage(html, torrentId))
+      .catch(retryWith);
+}
+
+/**
+ * Loads one browse page of a category and parses its torrent table, retrying on failure.
+ *
+ * @param {Object} [config] - `category` is put into the URL; `page` is 1-based and defaults to 1.
+ * @param {number} [retries] - attempts left; 0 rejects with the last error.
+ * @param {Error} [error] - failure of the previous attempt, used as the rejection reason.
+ * @returns {Promise<Object[]>} the records produced by parseTableBody.
+ */
+function browse(config = {}, retries = 2, error = null) {
+  if (retries === 0) {
+    const failure = error || new Error(`Failed browse request`);
+    return Promise.reject(failure);
+  }
+
+  const { category, page: requestedPage } = config;
+  const page = requestedPage || 1;
+  // the site counts pages from zero, hence the `page - 1`
+  const listingUrl = `${baseUrl}/browse/${page - 1}/${category}/0/0`;
+  const retryWith = (cause) => browse(config, retries - 1, cause);
+
+  return singleRequest(listingUrl)
+      .then((html) => parseTableBody(html))
+      .catch(retryWith);
+}
+
+/**
+ * Requests the `.files` description of a torrent and parses it with parseFiles.
+ * No retry is attempted; a failed request or parse rejects the returned promise.
+ *
+ * @param {string} torrentId - Rutor torrent id.
+ * @returns {Promise<Object[]>} the entries produced by parseFiles.
+ */
+function files(torrentId) {
+  const listingUrl = `${baseUrl}/descriptions/${torrentId}.files`;
+  return singleRequest(listingUrl).then((listing) => parseFiles(listing));
+}
+
+/**
+ * Performs one GET request and returns the response body after sanity checks.
+ *
+ * @param {string} requestUrl - absolute URL to fetch.
+ * @returns {Promise<*>} the response body as delivered by needle.
+ * Rejects when the body is empty or contains one of the known error page markers.
+ */
+function singleRequest(requestUrl) {
+  const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };
+  const invalidBodyMarkers = ['502: Bad gateway', '403 Forbidden', 'Origin DNS error'];
+
+  const validateBody = (response) => {
+    const { body } = response;
+    if (!body) {
+      throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
+    }
+    if (body.includes('Access Denied')) {
+      console.log(`Access Denied: ${requestUrl}`);
+      throw new Error(`Access Denied: ${requestUrl}`);
+    }
+    if (invalidBodyMarkers.some((marker) => body.includes(marker))) {
+      throw new Error(`Invalid body contents: ${requestUrl}`);
+    }
+    return body;
+  };
+
+  return needle('get', requestUrl, options).then(validateBody);
+}
+
+/**
+ * Parses a Rutor browse page into lightweight torrent records.
+ *
+ * Every row of the `#index` table except the `.backgr` ones becomes one record.
+ * Any exception thrown while reading a row (missing link, undecodable magnet,
+ * unexpected href) rejects the returned promise for the whole page.
+ *
+ * @param {string} body - HTML of a browse page.
+ * @returns {Promise<Object[]>} records with title, infoHash, magnetLink, torrentLink,
+ *   torrentId, seeds, seeders, leaches, uploadDate and size.
+ */
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      // stop here, otherwise the parsing below would still run after rejecting
+      return reject(new Error('Failed loading body'));
+    }
+
+    const torrents = $('#index').find('tr:not(.backgr)').map((i, elem) => {
+      const row = $(elem).find('td');
+      const links = $(row[1]).find('a');
+      // the peers cell is the last one, the size cell the one before it
+      const peers = $(row[row.length - 1]);
+      const magnetLink = $(links[1]).attr('href');
+      const seeds = parseInt(peers.find('.green').text(), 10);
+
+      return {
+        title: $(links[2]).text(),
+        infoHash: decode(magnetLink).infoHash,
+        magnetLink: magnetLink,
+        torrentLink: $(links[0]).attr('href'),
+        torrentId: $(links[2]).attr('href').match(/torrent\/(\d+)/)[1],
+        seeds: seeds,
+        // same value under the name the scraper's processTorrentRecord reads (`record.seeders`)
+        seeders: seeds,
+        leaches: parseInt(peers.find('.red').text(), 10),
+        uploadDate: parseRussianDate($(row[0]).text()),
+        // display text of the size cell with the first `&#xA0;` entity turned into a space
+        size: $(row[row.length - 2]).html().replace('&#xA0;', ' '),
+      }
+    }).get();
+
+    resolve(torrents);
+  });
+}
+
+/**
+ * Parses the details page of a single torrent.
+ *
+ * Rows of the `#details` table are addressed relative to the end of the table
+ * (seeders, upload date and size), the description is taken from the first row.
+ * Any exception thrown while reading the page rejects the returned promise.
+ *
+ * @param {string} body - HTML of the torrent page.
+ * @param {string} torrentId - id the page was requested with, copied into the result.
+ * @returns {Promise<Object>} record with title, torrentId, infoHash, trackers, magnetLink,
+ *   torrentLink, seeders, category, languages, size, uploadDate and imdbId.
+ */
+function parseTorrentPage(body, torrentId) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      // stop here, otherwise the parsing below would still run after rejecting
+      return reject(new Error('Failed loading body'));
+    }
+    const rows = $('#details > tr')
+    const details = $(rows[0]).find('td:nth-of-type(2)');
+    const magnetLink = $('#download a:nth-of-type(1)').attr('href');
+    const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);
+    // decode the magnet link once and reuse it for both the hash and the trackers
+    const decodedMagnet = decode(magnetLink);
+
+    const parsedTorrent = {
+      title: $('#all h1').first().text(),
+      torrentId: torrentId,
+      infoHash: decodedMagnet.infoHash,
+      // de-duplicated tracker list joined into one comma separated string
+      trackers: Array.from(new Set(decodedMagnet.tr)).join(','),
+      magnetLink: magnetLink,
+      torrentLink: $('#download a:nth-of-type(2)').attr('href'),
+      seeders: parseInt($(rows[rows.length - 8]).find('td:nth-of-type(2)').first().text(), 10),
+      // last path segment of the category link, e.g. the slugs used as TYPE_MAPPING keys in the scraper
+      category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
+      languages: parseLanguages(details.text()),
+      size: parseSize($(rows[rows.length - 4]).find('td:nth-of-type(2)').text()),
+      uploadDate: parseDate($(rows[rows.length - 5]).find('td:nth-of-type(2)').first().text()),
+      // null when the description contains no imdb.com title link
+      imdbId: imdbIdMatch && imdbIdMatch[1]
+    };
+    resolve(parsedTorrent);
+  });
+}
+
+/**
+ * Parses the body of a `.files` description into file entries.
+ *
+ * NOTE(review): the exact markup is not visible from here. The first cell of each
+ * line is dropped (`slice(1)`), so the name is read from the second cell and the
+ * size from the third - confirm against a real response. `name` and `path` are
+ * both reduced to the part after the last slash, as before.
+ *
+ * @param {string} body - response text, split into rows on newlines.
+ * @returns {{fileIndex: number, name: string, path: string, size: number}[]}
+ * @throws {Error} when the body is empty; errors from parseSize propagate.
+ */
+function parseFiles(body) {
+  if (!body) {
+    throw new Error("No files in the body");
+  }
+  return body.split('\n')
+      .map((line) => line.match(/<td>([^<]+)<\/td>/g))
+      // lines without any cell (blank or trailing lines) produce null and hold no file
+      .filter((cells) => cells !== null)
+      // a global match returns whole `<td>...</td>` strings; keep only the inner text so
+      // the slash in `</td>` is not mistaken for a path separator below
+      .map((cells) => cells.slice(1).map((cell) => cell.replace(/^<td>|<\/td>$/g, '')))
+      .map((item, index) => ({
+        fileIndex: index,
+        name: item[0].replace(/^.+\//g, ''),
+        path: item[0].replace(/^.+\//, ''),
+        size: parseSize(item[1])
+      }));
+}
+
+/**
+ * Converts the upload date shown on a torrent page into a Date.
+ *
+ * @param {string} dateString - date in `DD-MM-YYYY HH:mm:ss` form, optionally followed
+ *   by a parenthesised suffix which is discarded.
+ * @returns {Date} the parsed date as produced by moment.
+ */
+function parseDate(dateString) {
+  const withoutSuffix = dateString.replace(/\s\(.*\)/, '');
+  const parsed = moment(withoutSuffix, 'DD-MM-YYYY HH:mm:ss');
+  return parsed.toDate();
+}
+
+// Russian month abbreviations mapped to the English ones,
+// used by parseRussianDate before the date is parsed with the `MMM` token.
+const russianMonths = {
+  'Янв': 'Jan',
+  'Фев': 'Feb',
+  'Мар': 'Mar',
+  'Апр': 'Apr',
+  'Май': 'May',
+  'Июн': 'Jun',
+  'Июл': 'Jul',
+  'Авг': 'Aug',
+  'Сен': 'Sep',
+  'Окт': 'Oct',
+  'Ноя': 'Nov',
+  'Дек': 'Dec'
+};
+
+/**
+ * Converts a browse page date with a Russian month abbreviation into a Date.
+ *
+ * @param {string} dateString - date in `DD MMM YY` form with a month from russianMonths;
+ *   non-breaking spaces are accepted as separators.
+ * @returns {Date} the parsed date as produced by moment.
+ */
+function parseRussianDate(dateString) {
+  // first table key contained in the input, undefined when none is
+  let rusMonth;
+  for (const candidate of Object.keys(russianMonths)) {
+    if (dateString.includes(candidate)) {
+      rusMonth = candidate;
+      break;
+    }
+  }
+
+  const translated = dateString.trim().replace(rusMonth, russianMonths[rusMonth]);
+  const preparedDate = translated.replace(/\u00a0/g, ' ');
+  return moment(preparedDate, 'DD MMM YY').toDate();
+}
+
+/**
+ * Extracts the exact byte count from a size text such as `1.37 GB (1468006400 Bytes)`.
+ *
+ * @param {string} sizeString - text containing a `(<digits> Bytes)` part; any whitespace
+ *   (including a non-breaking space) or none is accepted before `Bytes`.
+ * @returns {number} the size in bytes.
+ * @throws {Error} when the text is not a string or has no `(<digits> Bytes)` part.
+ */
+function parseSize(sizeString) {
+  const match = typeof sizeString === 'string' && sizeString.match(/\((\d+)\s*Bytes\)/);
+  if (!match) {
+    // fail with a readable message instead of a TypeError on `null[1]`
+    throw new Error(`Unable to parse size from: ${sizeString}`);
+  }
+  return parseInt(match[1], 10);
+}
+
+// One case-insensitive matcher per language name used by parseLanguages.
+// Each expects an audio/language label (Russian or English) followed later on the
+// same line by the language's English name, a short code or its Russian name.
+const languageMatchers = {
+  'russian': /(?:Язык|Звук|Аудио|audio|language).*(russian|\brus?\b|[Рр]усский)/i,
+  'english': /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i,
+  'ukrainian': /(?:Язык|Звук|Аудио|audio|language).*(ukrainian|\bukr\b|украинский)/i,
+  'french': /(?:Язык|Звук|Аудио|audio|language).*(french|\bfr\b|французский)/i,
+  'spanish': /(?:Язык|Звук|Аудио|audio|language).*(spanish|\bspa\b|испанский)/i,
+  'italian': /(?:Язык|Звук|Аудио|audio|language).*(italian|\bita\b|итальянский)/i,
+  'german': /(?:Язык|Звук|Аудио|audio|language).*(german|\bger\b|Немецкий)/i,
+  'korean': /(?:Язык|Звук|Аудио|audio|language).*(korean|Корейский)/i,
+  'arabic': /(?:Язык|Звук|Аудио|audio|language).*(arabic|Арабский)/i,
+  'portuguese': /(?:Язык|Звук|Аудио|audio|language).*(portuguese|Португальский)/i
+}
+
+/**
+ * Derives the audio languages of a torrent from its description text.
+ *
+ * Only the text before the first standalone `Text` / `Текст` line is inspected.
+ * `russian` is always part of the result.
+ *
+ * @param {string} details - plain text of the torrent description.
+ * @returns {string} comma separated language names, or `multi-audio` when more than four apply.
+ */
+function parseLanguages(details) {
+  const subsInfoMatch = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
+  const inspectedPart = subsInfoMatch ? details.substring(0, subsInfoMatch.index) : details;
+
+  // a Set keeps insertion order, so `russian` stays first and is never duplicated
+  const found = new Set(['russian']);
+  for (const lang of Object.keys(languageMatchers)) {
+    if (languageMatchers[lang].test(inspectedPart)) {
+      found.add(lang);
+    }
+  }
+
+  const languages = Array.from(found);
+  if (languages.length > 4) {
+    return 'multi-audio';
+  }
+  return languages.join(',');
+}
+
+// Public surface of the module; `files` and the parse helpers are not exported.
+module.exports = { torrent, browse, Categories };
--- a/scraper/scrapers/rutor/rutor_scraper.js
+++ b/scraper/scrapers/rutor/rutor_scraper.js
@@ -0,0 +1,125 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const rutor = require('./rutor_api');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const Promises = require('../../lib/promises');
+const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
+
+// Provider name: used for the provider lookup, stored on created entries and printed in log lines.
+const NAME = 'Rutor';
+// Maps the category slug parsed from a torrent page to the entry type;
+// torrents whose slug is missing here are rejected in processTorrentRecord.
+const TYPE_MAPPING = {
+  'kino': Type.MOVIE,
+  'nashe_kino': Type.MOVIE,
+  'nauchno_popularnoe': Type.MOVIE,
+  'inostrannoe': Type.MOVIE,
+  'seriali': Type.SERIES,
+  'nashi_seriali': Type.SERIES,
+  'tv': Type.SERIES,
+  'multiki': Type.MOVIE,
+  'anime': Type.ANIME
+}
+// Category slugs whose torrents are accepted even when the page carries no imdb id.
+const ALLOWED_WITHOUT_IMDB = ['kino', 'seriali', 'anime'];
+
+// Queue for browse page requests (see scrapeLatestTorrentsForCategory).
+const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
+// Queue for single torrent page requests (see processTorrentRecord).
+const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
+// Caps how many browse records are processed at the same time.
+const limiter = new Bottleneck({ maxConcurrent: 10 });
+// Categories visited by scrapeLatestTorrents; Categories.RUSSIAN_TV is not among them.
+const allowedCategories = [
+  rutor.Categories.FOREIGN_FILMS,
+  rutor.Categories.FOREIGN_RELEASES,
+  rutor.Categories.RUSSIAN_FILMS,
+  rutor.Categories.FOREIGN_SERIES,
+  rutor.Categories.RUSSIAN_SERIES,
+  rutor.Categories.SCIENCE_FILMS,
+  rutor.Categories.RUSSIAN_ANIMATION,
+  rutor.Categories.ANIME
+];
+
+/**
+ * Runs one full scrape of the provider and records when it started.
+ *
+ * The provider row is loaded first, all allowed categories are scraped, and only
+ * after a successful run is `lastScraped` set to the start time and saved.
+ *
+ * @returns {Promise<void>} resolves once the provider row is saved.
+ */
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  await scrapeLatestTorrents();
+
+  lastScrape.lastScraped = scrapeStart;
+  await lastScrape.save();
+
+  console.log(`[${moment()}] finished ${NAME} scrape`);
+}
+
+/**
+ * Seeder refresh hook for this provider; nothing is refreshed.
+ *
+ * @param {Object} torrent - ignored.
+ * @returns {Promise<Array>} always an empty array.
+ */
+async function updateSeeders(torrent) {
+  const noUpdates = [];
+  return noUpdates;
+}
+
+/**
+ * Scrapes every allowed category through Promises.sequence, handing it one task per category.
+ *
+ * @returns {Promise<Array>} the per-category results flattened by one level.
+ */
+async function scrapeLatestTorrents() {
+  const categoryTasks = allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category));
+  const entries = await Promises.sequence(categoryTasks);
+  // concat spreads array results and appends everything else unchanged
+  return [].concat(...entries);
+}
+
+/**
+ * Scrapes one browse page of a category, processes every listed torrent and
+ * moves on to the next page until getMaxPage(category) is reached.
+ *
+ * A failed browse request is logged and treated as an empty page, which also
+ * ends the pagination for that category.
+ *
+ * @param {string} category - one of rutor.Categories.
+ * @param {number} [page] - 1-based page to start from. It defaults to the first page:
+ *   with a larger default the `page < getMaxPage(category)` check could never
+ *   allow a second page and the newest torrents would be skipped.
+ * @returns {Promise<undefined>} resolves when the last page has been processed.
+ */
+async function scrapeLatestTorrentsForCategory(category, page = 1) {
+  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
+  return api_limiter.schedule(() => rutor.browse({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
+      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
+      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
+          ? scrapeLatestTorrentsForCategory(category, page + 1)
+          : Promise.resolve());
+}
+
+/**
+ * Turns one browse record into a stored torrent entry.
+ *
+ * Records already handled by checkAndUpdateTorrent, and old records without seeders,
+ * are returned unchanged. Otherwise the torrent page is fetched and a new entry is
+ * created, unless the category is unknown or an imdb id is required but missing.
+ *
+ * @param {Object} record - record produced by the browse page parser.
+ * @returns {Promise<Object|string>} the record, the created torrent, or a message
+ *   string describing why the record was rejected.
+ */
+async function processTorrentRecord(record) {
+  if (await checkAndUpdateTorrent(record)) {
+    return record;
+  }
+  const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
+  // browse records may carry the seed count as `seeds` rather than `seeders`;
+  // accept either name so the skip below can actually trigger
+  const seeders = record.seeders !== undefined ? record.seeders : record.seeds;
+  if (seeders === 0 && isOld) {
+    console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`)
+    return record;
+  }
+
+  // a failed page request resolves to undefined and is reported as an invalid record below
+  const foundTorrent = await api_entry_limiter.schedule(() => rutor.torrent(record.torrentId).catch(() => undefined));
+
+  if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
+    return Promise.resolve(`${NAME}: Invalid torrent record: ${record.torrentId}`);
+  }
+  if (!foundTorrent.imdbId && !ALLOWED_WITHOUT_IMDB.includes(foundTorrent.category)) {
+    return Promise.resolve(`${NAME}: No imdbId defined: ${record.torrentId}`);
+  }
+
+  const torrent = {
+    provider: NAME,
+    infoHash: foundTorrent.infoHash,
+    torrentId: foundTorrent.torrentId,
+    torrentLink: foundTorrent.torrentLink,
+    trackers: foundTorrent.trackers,
+    title: foundTorrent.title,
+    type: TYPE_MAPPING[foundTorrent.category],
+    size: foundTorrent.size,
+    seeders: foundTorrent.seeders,
+    uploadDate: foundTorrent.uploadDate,
+    imdbId: foundTorrent.imdbId,
+    languages: foundTorrent.languages || undefined,
+  };
+
+  return createTorrentEntry(torrent).then(() => torrent);
+}
+
+/**
+ * Number of browse pages scraped per run for a category.
+ *
+ * @param {string} category - one of rutor.Categories.
+ * @returns {number} 2 for foreign films and foreign series, 1 for everything else.
+ */
+function getMaxPage(category) {
+  const twoPageCategories = [
+    rutor.Categories.FOREIGN_FILMS,
+    rutor.Categories.FOREIGN_SERIES
+  ];
+  return twoPageCategories.includes(category) ? 2 : 1;
+}
+
+// Exposes the scrape entry point, the seeder refresh hook and the provider name.
+module.exports = { scrape, updateSeeders, NAME };