[scraper] add rutor scraper

This commit is contained in:
TheBeastLT
2021-02-15 12:38:43 +01:00
parent 0dac049107
commit e20ed50f17
8 changed files with 337 additions and 10 deletions

4
package-lock.json generated
View File

@@ -1870,8 +1870,8 @@
}
},
"parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
"requires": {
"moment": "^2.24.0"
}

View File

@@ -32,7 +32,7 @@
"nodejs-bing": "^0.1.0",
"nyaapi": "^2.3.3",
"parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
"pg": "^7.8.2",
"pg-hstore": "^2.3.2",
"real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#935a5c23ae809edbcd2a111526a7f74d6767c50d",

View File

@@ -84,7 +84,7 @@ function escapeTitle(title) {
.replace(/[\u0300-\u036F]/g, '')
.replace(/&/g, 'and')
.replace(/[;, ~./]+/g, ' ') // replace dots, commas or underscores with spaces
.replace(/[^\w \-()+#@!']+/g, '') // remove all non-alphanumeric chars
.replace(/[^\w \-()+#@!'\u0400-\u04ff]+/g, '') // remove all non-alphanumeric chars
.replace(/\s{2,}/, ' ') // replace multiple spaces
.trim();
}
@@ -102,7 +102,7 @@ async function getImdbId(info, type) {
return cacheWrapImdbId(key,
() => new Promise((resolve, reject) => {
nameToImdb({ name, year: info.year, type }, function (err, res) {
nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
if (res) {
resolve(res);
} else {

View File

@@ -95,9 +95,12 @@ async function torrentFiles(torrent, timeout) {
}
function getFilesFromObject(torrent) {
if (torrent.files && torrent.files.length) {
if (Array.isArray(torrent.files)) {
return Promise.resolve(torrent.files);
}
if (typeof torrent.files === 'function') {
return torrent.files();
}
return Promise.reject("No files in the object");
}

View File

@@ -1,4 +1,5 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const distance = require('jaro-winkler');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
@@ -9,6 +10,7 @@ const { Type } = require('./types');
const { isDisk } = require('./extension');
const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });
async function parseTorrentFiles(torrent) {
const parsedTorrentName = parse(torrent.title);
@@ -391,7 +393,7 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
} else if (metadata.videos.some(video => video.kitsuEpisode)) {
// imdb episode info is base
files
.filter(file => file.season && file.episodes)
.filter(file => Number.isInteger(file.season) && file.episodes)
.forEach(file => {
if (seriesMapping[file.season]) {
const seasonMapping = seriesMapping[file.season];
@@ -460,7 +462,7 @@ async function updateToCinemetaMetadata(metadata) {
function findMovieImdbId(title) {
const parsedTitle = typeof title === 'string' ? parse(title) : title;
return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined);
return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
}
function findMovieKitsuId(title) {

View File

@@ -6,7 +6,8 @@ const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
const torrentGalaxySiScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
const rutorScraper = require('../scrapers/rutor/rutor_scraper');
module.exports = [
{ scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
@@ -14,9 +15,10 @@ module.exports = [
{ scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
{ scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
{ scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: torrentGalaxySiScraper, name: torrentGalaxySiScraper.NAME, cron: '0 0 */4 ? * *' },
// { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
// { scraper: require('../scrapers/1337x/1337x_search_scraper') }

View File

@@ -0,0 +1,195 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const { defaultOptionsWithProxy } = require('../../lib/requestHelper');
const baseUrl = 'http://www.rutor.info';
const defaultTimeout = 10000;
const Categories = {
FOREIGN_FILMS: '1',
RUSSIAN_FILMS: '5',
SCIENCE_FILMS: '12',
FOREIGN_SERIES: '4',
RUSSIAN_SERIES: '16',
RUSSIAN_TV: '6',
RUSSIAN_ANIMATION: '7',
ANIME: '10',
FOREIGN_RELEASES: '17'
};
function torrent(torrentId, config = {}, retries = 2, error = null) {
if (!torrentId || retries === 0) {
return Promise.reject(error || new Error(`Failed ${torrentId} search`));
}
return singleRequest(`${baseUrl}/torrent/${torrentId}`)
.then((body) => parseTorrentPage(body, torrentId))
.catch((err) => torrent(torrentId, config, retries - 1, err));
}
function browse(config = {}, retries = 2, error = null) {
if (retries === 0) {
return Promise.reject(error || new Error(`Failed browse request`));
}
const page = config.page || 1;
const category = config.category;
return singleRequest(`${baseUrl}/browse/${page - 1}/${category}/0/0`)
.then((body) => parseTableBody(body))
.catch((err) => browse(config, retries - 1, err));
}
function files(torrentId) {
return singleRequest(`${baseUrl}/descriptions/${torrentId}.files`)
.then((body) => parseFiles(body));
}
function singleRequest(requestUrl) {
const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };
return needle('get', requestUrl, options)
.then((response) => {
const body = response.body;
if (!body) {
throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
} else if (body.includes('Access Denied')) {
console.log(`Access Denied: ${requestUrl}`);
throw new Error(`Access Denied: ${requestUrl}`);
} else if (body.includes('502: Bad gateway') ||
body.includes('403 Forbidden') ||
body.includes('Origin DNS error')) {
throw new Error(`Invalid body contents: ${requestUrl}`);
}
return body;
});
}
function parseTableBody(body) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
reject(new Error('Failed loading body'));
}
const torrents = $('#index').find('tr:not(.backgr)').map((i, elem) => {
const row = $(elem).find('td');
const links = $(row[1]).find('a');
const peers = $(row[row.length - 1]);
const magnetLink = $(links[1]).attr('href');
return {
title: $(links[2]).text(),
infoHash: decode(magnetLink).infoHash,
magnetLink: magnetLink,
torrentLink: $(links[0]).attr('href'),
torrentId: $(links[2]).attr('href').match(/torrent\/(\d+)/)[1],
seeds: parseInt(peers.find('.green').text()),
leaches: parseInt(peers.find('.red').text()),
uploadDate: parseRussianDate($(row[0]).text()),
size: $(row[row.length - 2]).html().replace(' ', ' '),
}
}).get();
resolve(torrents);
});
}
function parseTorrentPage(body, torrentId) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
reject(new Error('Failed loading body'));
}
const rows = $('#details > tr')
const details = $(rows[0]).find('td:nth-of-type(2)');
const magnetLink = $('#download a:nth-of-type(1)').attr('href');
const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);
const parsedTorrent = {
title: $('#all h1').first().text(),
torrentId: torrentId,
infoHash: decode(magnetLink).infoHash,
trackers: Array.from(new Set(decode(magnetLink).tr)).join(','),
magnetLink: magnetLink,
torrentLink: $('#download a:nth-of-type(2)').attr('href'),
seeders: parseInt($(rows[rows.length - 8]).find('td:nth-of-type(2)').first().text(), 10),
category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
languages: parseLanguages(details.text()),
size: parseSize($(rows[rows.length - 4]).find('td:nth-of-type(2)').text()),
uploadDate: parseDate($(rows[rows.length - 5]).find('td:nth-of-type(2)').first().text()),
imdbId: imdbIdMatch && imdbIdMatch[1]
};
resolve(parsedTorrent);
});
}
function parseFiles(body) {
if (!body) {
throw new Error("No files in the body");
}
return body.split('\n')
.map((item) => item.match(/<td>([^<]+)<\/td>/g).slice(1))
.map((item, index) => ({
fileIndex: index,
name: item[0].replace(/^.+\//g, ''),
path: item[0].replace(/^.+\//, ''),
size: parseSize(item[1])
}));
}
function parseDate(dateString) {
const preparedDate = dateString.replace(/\s\(.*\)/, '')
return moment(preparedDate, 'DD-MM-YYYY HH:mm:ss').toDate();
}
const russianMonths = {
'Янв': 'Jan',
'Фев': 'Feb',
'Мар': 'Mar',
'Апр': 'Apr',
'Май': 'May',
'Июн': 'Jun',
'Июл': 'Jul',
'Авг': 'Aug',
'Сен': 'Sep',
'Окт': 'Oct',
'Ноя': 'Nov',
'Дек': 'Dec'
};
function parseRussianDate(dateString) {
const rusMonth = Object.keys(russianMonths).find(month => dateString.includes(month));
const preparedDate = dateString.trim().replace(rusMonth, russianMonths[rusMonth]).replace(/\u00a0/g, ' ');
return moment(preparedDate, 'DD MMM YY').toDate();
}
function parseSize(sizeString) {
return parseInt(sizeString.match(/\((\d+) Bytes\)/)[1], 10);
}
const languageMatchers = {
'russian': /(?:Язык|Звук|Аудио|audio|language).*(russian|\brus?\b|[Рр]усский)/i,
'english': /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i,
'ukrainian': /(?:Язык|Звук|Аудио|audio|language).*(ukrainian|\bukr\b|украинский)/i,
'french': /(?:Язык|Звук|Аудио|audio|language).*(french|\bfr\b|французский)/i,
'spanish': /(?:Язык|Звук|Аудио|audio|language).*(spanish|\bspa\b|испанский)/i,
'italian': /(?:Язык|Звук|Аудио|audio|language).*(italian|\bita\b|итальянский)/i,
'german': /(?:Язык|Звук|Аудио|audio|language).*(german|\bger\b|Немецкий)/i,
'korean': /(?:Язык|Звук|Аудио|audio|language).*(korean|Корейский)/i,
'arabic': /(?:Язык|Звук|Аудио|audio|language).*(arabic|Арабский)/i,
'portuguese': /(?:Язык|Звук|Аудио|audio|language).*(portuguese|Португальский)/i
}
function parseLanguages(details) {
const subsInfoMatch = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
const detailsPart = subsInfoMatch ? details.substring(0, subsInfoMatch.index) : details;
const matchedLanguages = Object.keys(languageMatchers).filter(lang => languageMatchers[lang].test(detailsPart));
const languages = Array.from(new Set(['russian'].concat(matchedLanguages)));
return languages.length > 4 ? 'multi-audio' : languages.join(',');
}
module.exports = { torrent, browse, Categories };

View File

@@ -0,0 +1,125 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const rutor = require('./rutor_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const NAME = 'Rutor';
const TYPE_MAPPING = {
'kino': Type.MOVIE,
'nashe_kino': Type.MOVIE,
'nauchno_popularnoe': Type.MOVIE,
'inostrannoe': Type.MOVIE,
'seriali': Type.SERIES,
'nashi_seriali': Type.SERIES,
'tv': Type.SERIES,
'multiki': Type.MOVIE,
'anime': Type.ANIME
}
const ALLOWED_WITHOUT_IMDB = ['kino', 'seriali', 'anime'];
const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
const limiter = new Bottleneck({ maxConcurrent: 10 });
const allowedCategories = [
rutor.Categories.FOREIGN_FILMS,
rutor.Categories.FOREIGN_RELEASES,
rutor.Categories.RUSSIAN_FILMS,
rutor.Categories.FOREIGN_SERIES,
rutor.Categories.RUSSIAN_SERIES,
rutor.Categories.SCIENCE_FILMS,
rutor.Categories.RUSSIAN_ANIMATION,
rutor.Categories.ANIME
];
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
// const ids = [
// '637799'
// ];
// return Promise.all(ids.map(id => api_entry_limiter.schedule(() => rutor.torrent(id))
// .then(torrent => processTorrentRecord(torrent))))
// .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
return scrapeLatestTorrents()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return lastScrape.save();
})
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
return Promise.resolve([]);
}
async function scrapeLatestTorrents() {
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
.then(entries => entries.reduce((a, b) => a.concat(b), []));
}
async function scrapeLatestTorrentsForCategory(category, page = 185) {
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
return api_limiter.schedule(() => rutor.browse({ category, page }))
.catch(error => {
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
return Promise.resolve([]);
})
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
.then(resolved => resolved.length > 0 && page < getMaxPage(category)
? scrapeLatestTorrentsForCategory(category, page + 1)
: Promise.resolve());
}
async function processTorrentRecord(record) {
if (await checkAndUpdateTorrent(record)) {
return record;
}
const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
if (record.seeders === 0 && isOld) {
console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`)
return record;
}
const foundTorrent = await api_entry_limiter.schedule(() => rutor.torrent(record.torrentId).catch(() => undefined));
if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
return Promise.resolve(`${NAME}: Invalid torrent record: ${record.torrentId}`);
}
if (!foundTorrent.imdbId && !ALLOWED_WITHOUT_IMDB.includes(foundTorrent.category)) {
return Promise.resolve(`${NAME}: No imdbId defined: ${record.torrentId}`);
}
const torrent = {
provider: NAME,
infoHash: foundTorrent.infoHash,
torrentId: foundTorrent.torrentId,
torrentLink: foundTorrent.torrentLink,
trackers: foundTorrent.trackers,
title: foundTorrent.title,
type: TYPE_MAPPING[foundTorrent.category],
size: foundTorrent.size,
seeders: foundTorrent.seeders,
uploadDate: foundTorrent.uploadDate,
imdbId: foundTorrent.imdbId,
languages: foundTorrent.languages || undefined,
};
return createTorrentEntry(torrent).then(() => torrent);
}
function getMaxPage(category) {
switch (category) {
case rutor.Categories.FOREIGN_FILMS:
case rutor.Categories.FOREIGN_SERIES:
return 2;
default:
return 1;
}
}
module.exports = { scrape, updateSeeders, NAME };