[scraper] add rutor scraper
This commit is contained in:
4
package-lock.json
generated
4
package-lock.json
generated
@@ -1870,8 +1870,8 @@
|
||||
}
|
||||
},
|
||||
"parse-torrent-title": {
|
||||
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
|
||||
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
|
||||
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
|
||||
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
|
||||
"requires": {
|
||||
"moment": "^2.24.0"
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
"nodejs-bing": "^0.1.0",
|
||||
"nyaapi": "^2.3.3",
|
||||
"parse-torrent": "^6.1.2",
|
||||
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
|
||||
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
|
||||
"pg": "^7.8.2",
|
||||
"pg-hstore": "^2.3.2",
|
||||
"real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#935a5c23ae809edbcd2a111526a7f74d6767c50d",
|
||||
|
||||
@@ -84,7 +84,7 @@ function escapeTitle(title) {
|
||||
.replace(/[\u0300-\u036F]/g, '')
|
||||
.replace(/&/g, 'and')
|
||||
.replace(/[;, ~./]+/g, ' ') // replace dots, commas or underscores with spaces
|
||||
.replace(/[^\w \-()+#@!']+/g, '') // remove all non-alphanumeric chars
|
||||
.replace(/[^\w \-()+#@!'\u0400-\u04ff]+/g, '') // remove all non-alphanumeric chars
|
||||
.replace(/\s{2,}/, ' ') // replace multiple spaces
|
||||
.trim();
|
||||
}
|
||||
@@ -102,7 +102,7 @@ async function getImdbId(info, type) {
|
||||
|
||||
return cacheWrapImdbId(key,
|
||||
() => new Promise((resolve, reject) => {
|
||||
nameToImdb({ name, year: info.year, type }, function (err, res) {
|
||||
nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
|
||||
if (res) {
|
||||
resolve(res);
|
||||
} else {
|
||||
|
||||
@@ -95,9 +95,12 @@ async function torrentFiles(torrent, timeout) {
|
||||
}
|
||||
|
||||
function getFilesFromObject(torrent) {
|
||||
if (torrent.files && torrent.files.length) {
|
||||
if (Array.isArray(torrent.files)) {
|
||||
return Promise.resolve(torrent.files);
|
||||
}
|
||||
if (typeof torrent.files === 'function') {
|
||||
return torrent.files();
|
||||
}
|
||||
return Promise.reject("No files in the object");
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const distance = require('jaro-winkler');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const Promises = require('../lib/promises');
|
||||
@@ -9,6 +10,7 @@ const { Type } = require('./types');
|
||||
const { isDisk } = require('./extension');
|
||||
|
||||
const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
|
||||
const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });
|
||||
|
||||
async function parseTorrentFiles(torrent) {
|
||||
const parsedTorrentName = parse(torrent.title);
|
||||
@@ -391,7 +393,7 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
|
||||
} else if (metadata.videos.some(video => video.kitsuEpisode)) {
|
||||
// imdb episode info is base
|
||||
files
|
||||
.filter(file => file.season && file.episodes)
|
||||
.filter(file => Number.isInteger(file.season) && file.episodes)
|
||||
.forEach(file => {
|
||||
if (seriesMapping[file.season]) {
|
||||
const seasonMapping = seriesMapping[file.season];
|
||||
@@ -460,7 +462,7 @@ async function updateToCinemetaMetadata(metadata) {
|
||||
|
||||
function findMovieImdbId(title) {
|
||||
const parsedTitle = typeof title === 'string' ? parse(title) : title;
|
||||
return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined);
|
||||
return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
|
||||
}
|
||||
|
||||
function findMovieKitsuId(title) {
|
||||
|
||||
@@ -6,7 +6,8 @@ const kickassScraper = require('../scrapers/kickass/kickass_scraper');
|
||||
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
|
||||
const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
|
||||
const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
|
||||
const torrentGalaxySiScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
|
||||
const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
|
||||
const rutorScraper = require('../scrapers/rutor/rutor_scraper');
|
||||
|
||||
module.exports = [
|
||||
{ scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
@@ -14,9 +15,10 @@ module.exports = [
|
||||
{ scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
{ scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
{ scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
|
||||
{ scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
{ scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
|
||||
{ scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
{ scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
{ scraper: torrentGalaxySiScraper, name: torrentGalaxySiScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
// { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
|
||||
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
|
||||
// { scraper: require('../scrapers/1337x/1337x_search_scraper') }
|
||||
|
||||
195
scraper/scrapers/rutor/rutor_api.js
Normal file
195
scraper/scrapers/rutor/rutor_api.js
Normal file
@@ -0,0 +1,195 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const decode = require('magnet-uri');
|
||||
const { defaultOptionsWithProxy } = require('../../lib/requestHelper');
|
||||
|
||||
const baseUrl = 'http://www.rutor.info';
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
/**
 * Rutor category identifiers. `browse` interpolates one of these values into
 * the `/browse/<page>/<category>/0/0` listing URL.
 *
 * The table is exported and shared between modules, so it is frozen to keep
 * one consumer from accidentally changing an id for everybody else.
 */
const Categories = Object.freeze({
  FOREIGN_FILMS: '1',
  RUSSIAN_FILMS: '5',
  SCIENCE_FILMS: '12',
  FOREIGN_SERIES: '4',
  RUSSIAN_SERIES: '16',
  RUSSIAN_TV: '6',
  RUSSIAN_ANIMATION: '7',
  ANIME: '10',
  FOREIGN_RELEASES: '17',
});
|
||||
|
||||
/**
 * Fetches the details page of one torrent and parses it.
 *
 * A failed attempt re-invokes this function with `retries - 1` and the error
 * that caused the failure, so `retries` is the total number of attempts.
 *
 * @param {string} torrentId - Rutor torrent id used in the `/torrent/<id>` URL.
 * @param {object} [config] - Passed through to retry attempts; not read here.
 * @param {number} [retries] - Attempts left; `0` stops and rejects.
 * @param {Error|null} [error] - Error of the previous attempt, used as the rejection reason.
 * @returns {Promise<object>} The parsed torrent, or a rejection once attempts run out
 *     or when `torrentId` is missing.
 */
function torrent(torrentId, config = {}, retries = 2, error = null) {
  const attemptsExhausted = retries === 0;
  if (!torrentId || attemptsExhausted) {
    const failure = error || new Error(`Failed ${torrentId} search`);
    return Promise.reject(failure);
  }

  const pageUrl = `${baseUrl}/torrent/${torrentId}`;
  const retryWith = (err) => torrent(torrentId, config, retries - 1, err);

  return singleRequest(pageUrl)
      .then((body) => parseTorrentPage(body, torrentId))
      .catch(retryWith);
}
|
||||
|
||||
/**
 * Fetches one listing page of a category and parses its torrent rows.
 *
 * A failed attempt re-invokes this function with `retries - 1` and the error
 * that caused the failure, so `retries` is the total number of attempts.
 *
 * @param {object} [config]
 * @param {number} [config.page] - 1-based page number; falls back to 1. The URL uses `page - 1`.
 * @param {string} [config.category] - Category id placed in the URL.
 * @param {number} [retries] - Attempts left; `0` stops and rejects.
 * @param {Error|null} [error] - Error of the previous attempt, used as the rejection reason.
 * @returns {Promise<object[]>} Parsed rows, or a rejection once attempts run out.
 */
function browse(config = {}, retries = 2, error = null) {
  if (retries === 0) {
    const failure = error || new Error(`Failed browse request`);
    return Promise.reject(failure);
  }

  const page = config.page || 1;
  const category = config.category;
  const listingUrl = `${baseUrl}/browse/${page - 1}/${category}/0/0`;
  const retryWith = (err) => browse(config, retries - 1, err);

  return singleRequest(listingUrl)
      .then((body) => parseTableBody(body))
      .catch(retryWith);
}
|
||||
|
||||
/**
 * Fetches the `/descriptions/<torrentId>.files` resource and hands the
 * response body to `parseFiles`.
 *
 * @param {string} torrentId - Rutor torrent id.
 * @returns {Promise<object[]>} Whatever `parseFiles` produces for the body.
 */
function files(torrentId) {
  const filesUrl = `${baseUrl}/descriptions/${torrentId}.files`;
  return singleRequest(filesUrl).then((body) => parseFiles(body));
}
|
||||
|
||||
/**
 * Performs one GET request and returns the response body after checking it
 * for known failure markers.
 *
 * The request uses the proxy-aware default options with `open_timeout` set to
 * `defaultTimeout` and `follow` set to 2.
 *
 * @param {string} requestUrl - Absolute URL to fetch.
 * @returns {Promise<*>} The response body.
 * @throws {Error} (as a rejection) when the body is empty or contains one of
 *     the known error-page markers.
 */
function singleRequest(requestUrl) {
  const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };
  const invalidBodyMarkers = ['502: Bad gateway', '403 Forbidden', 'Origin DNS error'];

  const validateBody = (response) => {
    const { body } = response;

    if (!body) {
      throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
    }
    if (body.includes('Access Denied')) {
      console.log(`Access Denied: ${requestUrl}`);
      throw new Error(`Access Denied: ${requestUrl}`);
    }
    if (invalidBodyMarkers.some((marker) => body.includes(marker))) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  };

  return needle('get', requestUrl, options).then(validateBody);
}
|
||||
|
||||
/**
 * Parses a browse/listing page into one record per torrent row.
 *
 * Rows are the `tr` elements of `#index` that do not carry the `backgr`
 * class. Within a row, the second cell holds three links (torrent file,
 * magnet, details page), the first cell holds the upload date, the last cell
 * holds the peer counts and the one before it the size.
 *
 * @param {string} body - HTML of the listing page.
 * @returns {Promise<object[]>} Parsed rows; rejects when the body cannot be
 *     loaded or a row does not have the expected structure.
 */
async function parseTableBody(body) {
  const $ = cheerio.load(body);

  if (!$) {
    // Throw rather than flag-and-continue, so nothing below runs on a bad document.
    throw new Error('Failed loading body');
  }

  return $('#index').find('tr:not(.backgr)').map((i, elem) => {
    const row = $(elem).find('td');
    const links = $(row[1]).find('a');
    const peers = $(row[row.length - 1]);
    const magnetLink = $(links[1]).attr('href');
    const detailsLink = $(links[2]);
    const seeds = parseInt(peers.find('.green').text(), 10);

    return {
      title: detailsLink.text(),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      torrentLink: $(links[0]).attr('href'),
      torrentId: detailsLink.attr('href').match(/torrent\/(\d+)/)[1],
      seeds: seeds,
      // processTorrentRecord in rutor_scraper.js checks `record.seeders`;
      // expose the count under that name too, keeping `seeds` for compatibility.
      seeders: seeds,
      leaches: parseInt(peers.find('.red').text(), 10),
      uploadDate: parseRussianDate($(row[0]).text()),
      // The size cell separates number and unit with a non-breaking space.
      // NOTE(review): which spelling `.html()` emits depends on the cheerio
      // version, so every common form (entity or raw U+00A0) is normalised.
      size: $(row[row.length - 2]).html().replace(/&nbsp;|&#xA0;|&#160;|\u00a0/gi, ' '),
    };
  }).get();
}
|
||||
|
||||
/**
 * Parses a torrent details page.
 *
 * Seeders, upload date and size are read from rows of the `#details` table
 * addressed by their position counted from the end of the table. The imdb id
 * is taken from the first imdb.com title link found in the description cell.
 *
 * @param {string} body - HTML of the details page.
 * @param {string} torrentId - Id the page was requested for; copied into the result.
 * @returns {Promise<object>} Parsed torrent; rejects when the body cannot be
 *     loaded or the page does not have the expected structure.
 */
async function parseTorrentPage(body, torrentId) {
  const $ = cheerio.load(body);

  if (!$) {
    // Throw rather than flag-and-continue, so nothing below runs on a bad document.
    throw new Error('Failed loading body');
  }

  const rows = $('#details > tr');
  const details = $(rows[0]).find('td:nth-of-type(2)');
  const magnetLink = $('#download a:nth-of-type(1)').attr('href');
  const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);
  const decodedMagnet = decode(magnetLink); // decode once, reuse for hash and trackers
  // magnet-uri yields `tr` as a plain string when the magnet has a single
  // tracker; feeding that string to `new Set` would split it into characters.
  const trackers = [].concat(decodedMagnet.tr || []);
  const secondCellOfRow = (offsetFromEnd) => $(rows[rows.length - offsetFromEnd]).find('td:nth-of-type(2)');

  return {
    title: $('#all h1').first().text(),
    torrentId: torrentId,
    infoHash: decodedMagnet.infoHash,
    trackers: Array.from(new Set(trackers)).join(','),
    magnetLink: magnetLink,
    torrentLink: $('#download a:nth-of-type(2)').attr('href'),
    seeders: parseInt(secondCellOfRow(8).first().text(), 10),
    category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
    languages: parseLanguages(details.text()),
    size: parseSize(secondCellOfRow(4).text()),
    uploadDate: parseDate(secondCellOfRow(5).first().text()),
    imdbId: imdbIdMatch && imdbIdMatch[1]
  };
}
|
||||
|
||||
/**
 * Parses the file listing of a torrent into file entries.
 *
 * Each line of the body is scanned for `<td>…</td>` cells. Lines with fewer
 * than two cells (blank lines, wrappers) are ignored.
 *
 * NOTE(review): the exact row layout of the `.files` response is not visible
 * from this file; this assumes the last two cells of a row are the file path
 * and the size text understood by `parseSize` — confirm against a live response.
 *
 * @param {string} body - Raw listing markup, one table row per line.
 * @returns {{fileIndex: number, name: string, path: string, size: number}[]}
 * @throws {Error} When the body is empty, or a size cell cannot be parsed.
 */
function parseFiles(body) {
  if (!body) {
    throw new Error("No files in the body");
  }
  return body.split('\n')
      // A global match returns whole `<td>…</td>` strings, not capture groups,
      // so the tags are stripped explicitly to get the cell text.
      .map((line) => (line.match(/<td>([^<]+)<\/td>/g) || [])
          .map((cell) => cell.replace(/^<td>|<\/td>$/g, '')))
      .filter((cells) => cells.length >= 2)
      .map((cells, index) => {
        const [path, sizeText] = cells.slice(-2);
        return {
          fileIndex: index,
          name: path.replace(/^.+\//, ''), // basename only
          path: path,
          size: parseSize(sizeText)
        };
      });
}
|
||||
|
||||
/**
 * Parses a details-page date of the form `DD-MM-YYYY HH:mm:ss`, ignoring a
 * trailing parenthesised suffix if one is present.
 *
 * @param {string} dateString - Raw cell text.
 * @returns {Date} Result of parsing with moment.
 */
function parseDate(dateString) {
  const withoutSuffix = dateString.replace(/\s\(.*\)/, '');
  const parsed = moment(withoutSuffix, 'DD-MM-YYYY HH:mm:ss');
  return parsed.toDate();
}
|
||||
|
||||
// Abbreviated Russian month names mapped to the English abbreviations that
// parseRussianDate substitutes before parsing with the `MMM` token.
const russianMonths = {
  Янв: 'Jan',
  Фев: 'Feb',
  Мар: 'Mar',
  Апр: 'Apr',
  Май: 'May',
  Июн: 'Jun',
  Июл: 'Jul',
  Авг: 'Aug',
  Сен: 'Sep',
  Окт: 'Oct',
  Ноя: 'Nov',
  Дек: 'Dec',
};
|
||||
|
||||
/**
 * Parses a listing-page date such as `05 Янв 20`.
 *
 * The first Russian month abbreviation found in the text is swapped for its
 * English counterpart and non-breaking spaces become regular spaces, then the
 * result is parsed with the `DD MMM YY` format.
 *
 * @param {string} dateString - Raw cell text.
 * @returns {Date} Result of parsing with moment.
 */
function parseRussianDate(dateString) {
  let rusMonth;
  for (const month of Object.keys(russianMonths)) {
    if (dateString.includes(month)) {
      rusMonth = month;
      break;
    }
  }

  const withEnglishMonth = dateString.trim().replace(rusMonth, russianMonths[rusMonth]);
  const normalised = withEnglishMonth.replace(/\u00a0/g, ' ');
  return moment(normalised, 'DD MMM YY').toDate();
}
|
||||
|
||||
/**
 * Extracts the exact byte count from a size text such as
 * `1.37 GB (1469874176 Bytes)`.
 *
 * @param {string} sizeString - Size text containing `(<digits> Bytes)`.
 * @returns {number} The byte count.
 * @throws {Error} When the text has no `(<digits> Bytes)` part. The message
 *     includes the offending text, instead of an opaque TypeError from
 *     indexing a failed match.
 */
function parseSize(sizeString) {
  const match = /\((\d+) Bytes\)/.exec(sizeString);
  if (!match) {
    throw new Error(`Unable to parse size from: ${sizeString}`);
  }
  return parseInt(match[1], 10);
}
|
||||
|
||||
// Case-insensitive matchers, one per language. Each fires when an audio or
// language label (Russian or English) is followed, later in the text, by one
// of the names or abbreviations of that language.
const languageMatchers = (() => {
  const audioLabel = String.raw`(?:Язык|Звук|Аудио|audio|language)`;
  const matcherFor = (names) => new RegExp(`${audioLabel}.*(${names})`, 'i');

  return {
    russian: matcherFor(String.raw`russian|\brus?\b|[Рр]усский`),
    english: matcherFor(String.raw`english|\beng?\b|[Аа]нглийский`),
    ukrainian: matcherFor(String.raw`ukrainian|\bukr\b|украинский`),
    french: matcherFor(String.raw`french|\bfr\b|французский`),
    spanish: matcherFor(String.raw`spanish|\bspa\b|испанский`),
    italian: matcherFor(String.raw`italian|\bita\b|итальянский`),
    german: matcherFor(String.raw`german|\bger\b|Немецкий`),
    korean: matcherFor(String.raw`korean|Корейский`),
    arabic: matcherFor(String.raw`arabic|Арабский`),
    portuguese: matcherFor(String.raw`portuguese|Португальский`),
  };
})();
|
||||
|
||||
/**
 * Derives the audio languages from the description text of a torrent.
 *
 * Only the text before the first standalone `Text` / `Текст` line (optionally
 * numbered) is inspected; the rest is left out of the language detection.
 * `russian` is always part of the result.
 *
 * @param {string} details - Plain text of the description cell.
 * @returns {string} Comma-separated language names, or `multi-audio` when
 *     more than four languages were detected.
 */
function parseLanguages(details) {
  const subsInfoMatch = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
  const audioSection = subsInfoMatch ? details.substring(0, subsInfoMatch.index) : details;

  const detected = new Set(['russian']);
  for (const lang of Object.keys(languageMatchers)) {
    if (languageMatchers[lang].test(audioSection)) {
      detected.add(lang);
    }
  }

  if (detected.size > 4) {
    return 'multi-audio';
  }
  return Array.from(detected).join(',');
}
|
||||
|
||||
module.exports = { torrent, browse, Categories };
|
||||
125
scraper/scrapers/rutor/rutor_scraper.js
Normal file
125
scraper/scrapers/rutor/rutor_scraper.js
Normal file
@@ -0,0 +1,125 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const rutor = require('./rutor_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../../lib/repository');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'Rutor';
|
||||
// Maps the category slug parsed from a torrent details page to the content
// type stored with the torrent. Slugs absent from this table are treated as
// invalid records by processTorrentRecord.
const TYPE_MAPPING = {
  // movies
  kino: Type.MOVIE,
  nashe_kino: Type.MOVIE,
  nauchno_popularnoe: Type.MOVIE,
  inostrannoe: Type.MOVIE,
  // series
  seriali: Type.SERIES,
  nashi_seriali: Type.SERIES,
  tv: Type.SERIES,
  // animation
  multiki: Type.MOVIE,
  anime: Type.ANIME,
};
|
||||
const ALLOWED_WITHOUT_IMDB = ['kino', 'seriali', 'anime'];
|
||||
|
||||
const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
|
||||
const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
|
||||
const limiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
// Categories visited by scrapeLatestTorrents, in scraping order.
const allowedCategories = [
  'FOREIGN_FILMS',
  'FOREIGN_RELEASES',
  'RUSSIAN_FILMS',
  'FOREIGN_SERIES',
  'RUSSIAN_SERIES',
  'SCIENCE_FILMS',
  'RUSSIAN_ANIMATION',
  'ANIME',
].map((categoryName) => rutor.Categories[categoryName]);
|
||||
|
||||
/**
 * Runs one full scrape: loads the provider record, scrapes the latest
 * torrents of every allowed category, then stores the start time of this run
 * as the provider's `lastScraped` value.
 *
 * @returns {Promise<void>} Settles once the provider record has been saved.
 */
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  await scrapeLatestTorrents();

  // Persist the start (not the end) of the run as the last scrape time.
  lastScrape.lastScraped = scrapeStart;
  await lastScrape.save();

  console.log(`[${moment()}] finished ${NAME} scrape`);
}
|
||||
|
||||
/**
 * Seeder refresh hook. This scraper does not refresh anything and always
 * resolves with an empty list.
 *
 * @param {object} torrent - Ignored.
 * @returns {Promise<Array>} Always an empty array.
 */
async function updateSeeders(torrent) {
  return [];
}
|
||||
|
||||
/**
 * Scrapes every allowed category, one after another, and flattens the
 * per-category results into a single list.
 *
 * @returns {Promise<Array>} Concatenation of the per-category results.
 */
async function scrapeLatestTorrents() {
  const categoryTasks = allowedCategories.map((category) => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(categoryTasks);
  return [].concat(...entries);
}
|
||||
|
||||
/**
 * Scrapes the listing pages of one category, starting at `page` and moving on
 * to the next page while the current one produced records and the category's
 * page limit (`getMaxPage`) has not been reached.
 *
 * A failed listing request is logged and treated as an empty page, which ends
 * the walk for this category without failing the whole scrape.
 *
 * @param {string} category - Rutor category id.
 * @param {number} [page] - 1-based page to start from. Defaults to the first
 *     page: with the page limits in `getMaxPage` (1 or 2), any larger start
 *     page would skip the newest torrents altogether.
 * @returns {Promise<void>}
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);

  let torrents;
  try {
    torrents = await api_limiter.schedule(() => rutor.browse({ category, page }));
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }

  const resolved = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))));

  if (resolved.length > 0 && page < getMaxPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
}
|
||||
|
||||
/**
 * Processes one listing record.
 *
 * Steps, in order:
 *  1. if `checkAndUpdateTorrent` reports the record as handled, stop;
 *  2. skip records older than 18 months whose `seeders` is exactly 0;
 *  3. load the details page (rate limited); a failed load counts as invalid;
 *  4. reject categories without a type mapping, and records without an imdb
 *     id unless their category is in `ALLOWED_WITHOUT_IMDB`;
 *  5. create the torrent entry.
 *
 * @param {object} record - A row produced by the listing parser.
 * @returns {Promise<object|string>} The record (steps 1-2), a message string
 *     for rejected records (steps 3-4), or the created torrent (step 5).
 */
async function processTorrentRecord(record) {
  const alreadyHandled = await checkAndUpdateTorrent(record);
  if (alreadyHandled) {
    return record;
  }

  const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
  if (record.seeders === 0 && isOld) {
    console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`)
    return record;
  }

  // A failed details request resolves to undefined and is reported as invalid below.
  const loadDetails = () => rutor.torrent(record.torrentId).catch(() => undefined);
  const foundTorrent = await api_entry_limiter.schedule(loadDetails);

  if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
    return `${NAME}: Invalid torrent record: ${record.torrentId}`;
  }
  if (!foundTorrent.imdbId && !ALLOWED_WITHOUT_IMDB.includes(foundTorrent.category)) {
    return `${NAME}: No imdbId defined: ${record.torrentId}`;
  }

  const { infoHash, torrentId, torrentLink, trackers, title, size, seeders, uploadDate, imdbId } = foundTorrent;
  const torrent = {
    provider: NAME,
    infoHash,
    torrentId,
    torrentLink,
    trackers,
    title,
    type: TYPE_MAPPING[foundTorrent.category],
    size,
    seeders,
    uploadDate,
    imdbId,
    languages: foundTorrent.languages || undefined,
  };

  await createTorrentEntry(torrent);
  return torrent;
}
|
||||
|
||||
/**
 * Returns how many listing pages are scraped for a category: two for foreign
 * films and foreign series, one for everything else.
 *
 * @param {string} category - Rutor category id.
 * @returns {number} Page limit for the category.
 */
function getMaxPage(category) {
  const twoPageCategories = [
    rutor.Categories.FOREIGN_FILMS,
    rutor.Categories.FOREIGN_SERIES,
  ];
  return twoPageCategories.includes(category) ? 2 : 1;
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
Reference in New Issue
Block a user