diff --git a/package-lock.json b/package-lock.json
index fd42a52..7342487 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1870,8 +1870,8 @@
       }
     },
     "parse-torrent-title": {
-      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
-      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
+      "version": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
+      "from": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
       "requires": {
         "moment": "^2.24.0"
       }
diff --git a/package.json b/package.json
index f9ff371..f090510 100644
--- a/package.json
+++ b/package.json
@@ -32,7 +32,7 @@
     "nodejs-bing": "^0.1.0",
     "nyaapi": "^2.3.3",
     "parse-torrent": "^6.1.2",
-    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#e8866fdd3f5659c20847b926fb5d297f7634ad26",
+    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#d570320c4c6370ce71509b97cc67b5edf0b094b2",
     "pg": "^7.8.2",
     "pg-hstore": "^2.3.2",
     "real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#935a5c23ae809edbcd2a111526a7f74d6767c50d",
diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js
index 0de2cbf..bf171b0 100644
--- a/scraper/lib/metadata.js
+++ b/scraper/lib/metadata.js
@@ -84,7 +84,7 @@ function escapeTitle(title) {
       .replace(/[\u0300-\u036F]/g, '')
       .replace(/&/g, 'and')
       .replace(/[;, ~./]+/g, ' ') // replace dots, commas or underscores with spaces
-      .replace(/[^\w \-()+#@!']+/g, '') // remove all non-alphanumeric chars
+      .replace(/[^\w \-()+#@!'\u0400-\u04ff]+/g, '') // remove all non-alphanumeric chars
       .replace(/\s{2,}/, ' ') // replace multiple spaces
       .trim();
 }
@@ -102,7 +102,7 @@ async function getImdbId(info, type) {
 
   return cacheWrapImdbId(key,
       () => new Promise((resolve, reject) => {
-        nameToImdb({ name, year: info.year, type }, function (err, res) {
+        nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
           if (res) {
             resolve(res);
           } else {
diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js
index 1cc2d50..a371355 100644
--- a/scraper/lib/torrent.js
+++ b/scraper/lib/torrent.js
@@ -95,9 +95,12 @@ async function torrentFiles(torrent, timeout) {
 }
 
 function getFilesFromObject(torrent) {
-  if (torrent.files && torrent.files.length) {
+  if (Array.isArray(torrent.files)) {
     return Promise.resolve(torrent.files);
   }
+  if (typeof torrent.files === 'function') {
+    return torrent.files();
+  }
   return Promise.reject("No files in the object");
 }
 
diff --git a/scraper/lib/torrentFiles.js b/scraper/lib/torrentFiles.js
index c4be8d0..69a982e 100644
--- a/scraper/lib/torrentFiles.js
+++ b/scraper/lib/torrentFiles.js
@@ -1,4 +1,5 @@
 const moment = require('moment');
+const Bottleneck = require('bottleneck');
 const distance = require('jaro-winkler');
 const { parse } = require('parse-torrent-title');
 const Promises = require('../lib/promises');
@@ -9,6 +10,7 @@ const { Type } = require('./types');
 const { isDisk } = require('./extension');
 
 const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
+const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });
 
 async function parseTorrentFiles(torrent) {
   const parsedTorrentName = parse(torrent.title);
@@ -391,7 +393,7 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
   } else if (metadata.videos.some(video => video.kitsuEpisode)) {
     // imdb episode info is base
     files
-        .filter(file => file.season && file.episodes)
+        .filter(file => Number.isInteger(file.season) && file.episodes)
         .forEach(file => {
           if (seriesMapping[file.season]) {
             const seasonMapping = seriesMapping[file.season];
@@ -460,7 +462,7 @@ async function updateToCinemetaMetadata(metadata) {
 
 function findMovieImdbId(title) {
   const parsedTitle = typeof title === 'string' ? parse(title) : title;
-  return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined);
+  return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
 }
 
 function findMovieKitsuId(title) {
diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js
index 3641e95..9e1bdcc 100644
--- a/scraper/scheduler/scrapers.js
+++ b/scraper/scheduler/scrapers.js
@@ -6,7 +6,8 @@ const kickassScraper = require('../scrapers/kickass/kickass_scraper');
 const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
 const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
 const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
-const torrentGalaxySiScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
+const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
+const rutorScraper = require('../scrapers/rutor/rutor_scraper');
 
 module.exports = [
   { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
@@ -14,9 +15,10 @@ module.exports = [
   { scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
+  { scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
+  { scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
-  { scraper: torrentGalaxySiScraper, name: torrentGalaxySiScraper.NAME, cron: '0 0 */4 ? * *' },
   // { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
   // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
   // { scraper: require('../scrapers/1337x/1337x_search_scraper') }
diff --git a/scraper/scrapers/rutor/rutor_api.js b/scraper/scrapers/rutor/rutor_api.js
new file mode 100644
index 0000000..bfc3eb2
--- /dev/null
+++ b/scraper/scrapers/rutor/rutor_api.js
@@ -0,0 +1,220 @@
+const cheerio = require('cheerio');
+const needle = require('needle');
+const moment = require('moment');
+const decode = require('magnet-uri');
+const { defaultOptionsWithProxy } = require('../../lib/requestHelper');
+
+const baseUrl = 'http://www.rutor.info';
+const defaultTimeout = 10000;
+
+// Rutor category ids as used in the /browse/{page}/{category}/0/0 url.
+const Categories = {
+  FOREIGN_FILMS: '1',
+  RUSSIAN_FILMS: '5',
+  SCIENCE_FILMS: '12',
+  FOREIGN_SERIES: '4',
+  RUSSIAN_SERIES: '16',
+  RUSSIAN_TV: '6',
+  RUSSIAN_ANIMATION: '7',
+  ANIME: '10',
+  FOREIGN_RELEASES: '17'
+};
+
+/**
+ * Loads and parses a single torrent page, retrying on failure.
+ * Rejects with the last error once the `retries` attempts are used up.
+ */
+function torrent(torrentId, config = {}, retries = 2, error = null) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(error || new Error(`Failed ${torrentId} search`));
+  }
+
+  return singleRequest(`${baseUrl}/torrent/${torrentId}`)
+      .then((body) => parseTorrentPage(body, torrentId))
+      .catch((err) => torrent(torrentId, config, retries - 1, err));
+}
+
+/**
+ * Loads one listing page of `config.category` and parses its torrent rows.
+ * `config.page` is 1-based (defaults to 1); the request url uses page - 1.
+ */
+function browse(config = {}, retries = 2, error = null) {
+  if (retries === 0) {
+    return Promise.reject(error || new Error(`Failed browse request`));
+  }
+  const page = config.page || 1;
+  const category = config.category;
+
+  return singleRequest(`${baseUrl}/browse/${page - 1}/${category}/0/0`)
+      .then((body) => parseTableBody(body))
+      .catch((err) => browse(config, retries - 1, err));
+}
+
+// Loads and parses the file listing of a torrent. Not part of module.exports.
+function files(torrentId) {
+  return singleRequest(`${baseUrl}/descriptions/${torrentId}.files`)
+      .then((body) => parseFiles(body));
+}
+
+// Performs a GET request; rejects when the body is empty or contains a known error-page marker.
+function singleRequest(requestUrl) {
+  const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };
+
+  return needle('get', requestUrl, options)
+      .then((response) => {
+        const body = response.body;
+        if (!body) {
+          throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
+        } else if (body.includes('Access Denied')) {
+          console.log(`Access Denied: ${requestUrl}`);
+          throw new Error(`Access Denied: ${requestUrl}`);
+        } else if (body.includes('502: Bad gateway') ||
+            body.includes('403 Forbidden') ||
+            body.includes('Origin DNS error')) {
+          throw new Error(`Invalid body contents: ${requestUrl}`);
+        }
+        return body;
+      });
+}
+
+// Parses the rows of the listing table (#index) into lightweight torrent records.
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+
+    const torrents = $('#index').find('tr:not(.backgr)').map((i, elem) => {
+      const row = $(elem).find('td');
+      const links = $(row[1]).find('a');
+      const peers = $(row[row.length - 1]);
+      const magnetLink = $(links[1]).attr('href');
+
+      return {
+        title: $(links[2]).text(),
+        infoHash: decode(magnetLink).infoHash,
+        magnetLink: magnetLink,
+        torrentLink: $(links[0]).attr('href'),
+        torrentId: $(links[2]).attr('href').match(/torrent\/(\d+)/)[1],
+        // named `seeders` because the scraper checks record.seeders
+        seeders: parseInt(peers.find('.green').text(), 10),
+        leechers: parseInt(peers.find('.red').text(), 10),
+        uploadDate: parseRussianDate($(row[0]).text()),
+        // the separator may come back as the &nbsp; entity or as a raw non-breaking space
+        size: $(row[row.length - 2]).html().replace(/&nbsp;|\u00a0/g, ' '),
+      };
+    }).get();
+
+    resolve(torrents);
+  });
+}
+
+// Parses a torrent details page; a page that does not have the expected structure rejects the promise.
+function parseTorrentPage(body, torrentId) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+    const rows = $('#details > tr');
+    const details = $(rows[0]).find('td:nth-of-type(2)');
+    const magnetLink = $('#download a:nth-of-type(1)').attr('href');
+    const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);
+
+    const parsedTorrent = {
+      title: $('#all h1').first().text(),
+      torrentId: torrentId,
+      infoHash: decode(magnetLink).infoHash,
+      trackers: Array.from(new Set(decode(magnetLink).tr)).join(','),
+      magnetLink: magnetLink,
+      torrentLink: $('#download a:nth-of-type(2)').attr('href'),
+      seeders: parseInt($(rows[rows.length - 8]).find('td:nth-of-type(2)').first().text(), 10),
+      category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
+      languages: parseLanguages(details.text()),
+      size: parseSize($(rows[rows.length - 4]).find('td:nth-of-type(2)').text()),
+      uploadDate: parseDate($(rows[rows.length - 5]).find('td:nth-of-type(2)').first().text()),
+      imdbId: imdbIdMatch && imdbIdMatch[1]
+    };
+    resolve(parsedTorrent);
+  });
+}
+
+// NOTE(review): only used by the unexported files(); a line without a match throws here - verify against a real response.
+function parseFiles(body) {
+  if (!body) {
+    throw new Error("No files in the body");
+  }
+  return body.split('\n')
+      .map((item) => item.match(/([^<]+)<\/td>/g).slice(1))
+      .map((item, index) => ({
+        fileIndex: index,
+        name: item[0].replace(/^.+\//g, ''),
+        path: item[0].replace(/^.+\//, ''),
+        size: parseSize(item[1])
+      }));
+}
+
+// Parses a 'DD-MM-YYYY HH:mm:ss' date after dropping a trailing " (...)" part.
+function parseDate(dateString) {
+  const preparedDate = dateString.replace(/\s\(.*\)/, '');
+  return moment(preparedDate, 'DD-MM-YYYY HH:mm:ss').toDate();
+}
+
+const russianMonths = {
+  'Янв': 'Jan',
+  'Фев': 'Feb',
+  'Мар': 'Mar',
+  'Апр': 'Apr',
+  'Май': 'May',
+  'Июн': 'Jun',
+  'Июл': 'Jul',
+  'Авг': 'Aug',
+  'Сен': 'Sep',
+  'Окт': 'Oct',
+  'Ноя': 'Nov',
+  'Дек': 'Dec'
+};
+
+// Parses a 'DD MMM YY' date after swapping the Russian month abbreviation for the English one.
+function parseRussianDate(dateString) {
+  const rusMonth = Object.keys(russianMonths).find(month => dateString.includes(month));
+  const preparedDate = dateString.trim().replace(rusMonth, russianMonths[rusMonth]).replace(/\u00a0/g, ' ');
+  return moment(preparedDate, 'DD MMM YY').toDate();
+}
+
+// Extracts the byte count from a string containing '(<number> Bytes)'; throws when it is absent.
+function parseSize(sizeString) {
+  return parseInt(sizeString.match(/\((\d+) Bytes\)/)[1], 10);
+}
+
+// Per-language detectors run against the description text of a torrent page.
+const languageMatchers = {
+  'russian': /(?:Язык|Звук|Аудио|audio|language).*(russian|\brus?\b|[Рр]усский)/i,
+  'english': /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i,
+  'ukrainian': /(?:Язык|Звук|Аудио|audio|language).*(ukrainian|\bukr\b|украинский)/i,
+  'french': /(?:Язык|Звук|Аудио|audio|language).*(french|\bfr\b|французский)/i,
+  'spanish': /(?:Язык|Звук|Аудио|audio|language).*(spanish|\bspa\b|испанский)/i,
+  'italian': /(?:Язык|Звук|Аудио|audio|language).*(italian|\bita\b|итальянский)/i,
+  'german': /(?:Язык|Звук|Аудио|audio|language).*(german|\bger\b|Немецкий)/i,
+  'korean': /(?:Язык|Звук|Аудио|audio|language).*(korean|Корейский)/i,
+  'arabic': /(?:Язык|Звук|Аудио|audio|language).*(arabic|Арабский)/i,
+  'portuguese': /(?:Язык|Звук|Аудио|audio|language).*(portuguese|Португальский)/i
+};
+
+/**
+ * Detects languages in the description text, looking only at the part before the
+ * first "Text"/"Текст" heading (presumably the subtitles section). Russian is always included.
+ * Returns a comma separated list, or 'multi-audio' when more than 4 languages are found.
+ */
+function parseLanguages(details) {
+  const subsInfoMatch = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
+  const detailsPart = subsInfoMatch ? details.substring(0, subsInfoMatch.index) : details;
+  const matchedLanguages = Object.keys(languageMatchers).filter(lang => languageMatchers[lang].test(detailsPart));
+  const languages = Array.from(new Set(['russian'].concat(matchedLanguages)));
+  return languages.length > 4 ? 'multi-audio' : languages.join(',');
+}
+
+module.exports = { torrent, browse, Categories };
diff --git a/scraper/scrapers/rutor/rutor_scraper.js b/scraper/scrapers/rutor/rutor_scraper.js
new file mode 100644
index 0000000..3e8b9e1
--- /dev/null
+++ b/scraper/scrapers/rutor/rutor_scraper.js
@@ -0,0 +1,143 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const rutor = require('./rutor_api');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const Promises = require('../../lib/promises');
+const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
+
+const NAME = 'Rutor';
+// Maps the category slug parsed from a torrent page to a content type.
+const TYPE_MAPPING = {
+  'kino': Type.MOVIE,
+  'nashe_kino': Type.MOVIE,
+  'nauchno_popularnoe': Type.MOVIE,
+  'inostrannoe': Type.MOVIE,
+  'seriali': Type.SERIES,
+  'nashi_seriali': Type.SERIES,
+  'tv': Type.SERIES,
+  'multiki': Type.MOVIE,
+  'anime': Type.ANIME
+};
+// Categories whose torrents are stored even when the page has no imdb id.
+const ALLOWED_WITHOUT_IMDB = ['kino', 'seriali', 'anime'];
+
+const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
+const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
+const limiter = new Bottleneck({ maxConcurrent: 10 });
+const allowedCategories = [
+  rutor.Categories.FOREIGN_FILMS,
+  rutor.Categories.FOREIGN_RELEASES,
+  rutor.Categories.RUSSIAN_FILMS,
+  rutor.Categories.FOREIGN_SERIES,
+  rutor.Categories.RUSSIAN_SERIES,
+  rutor.Categories.SCIENCE_FILMS,
+  rutor.Categories.RUSSIAN_ANIMATION,
+  rutor.Categories.ANIME
+];
+
+/**
+ * Scrapes the latest torrents of every allowed category and afterwards
+ * saves the scrape start time on the provider record.
+ */
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  // const ids = [
+  //   '637799'
+  // ];
+  // return Promise.all(ids.map(id => api_entry_limiter.schedule(() => rutor.torrent(id))
+  //     .then(torrent => processTorrentRecord(torrent))))
+  //     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+  return scrapeLatestTorrents()
+      .then(() => {
+        lastScrape.lastScraped = scrapeStart;
+        return lastScrape.save();
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+}
+
+// Seeders update is not implemented for this provider; always resolves to an empty list.
+async function updateSeeders(torrent) {
+  return Promise.resolve([]);
+}
+
+// Runs the per-category scrapes through Promises.sequence and flattens the results.
+async function scrapeLatestTorrents() {
+  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+/**
+ * Scrapes one listing page of a category and continues with the next page
+ * until a page yields no records or getMaxPage(category) is reached.
+ * Starts at the first page by default.
+ */
+async function scrapeLatestTorrentsForCategory(category, page = 1) {
+  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
+  return api_limiter.schedule(() => rutor.browse({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
+      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
+      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
+          ? scrapeLatestTorrentsForCategory(category, page + 1)
+          : Promise.resolve());
+}
+
+/**
+ * Stores a listed torrent unless checkAndUpdateTorrent reports it as handled or it is old and unseeded.
+ * Resolves to the record/torrent, or to a message string when the torrent page is invalid.
+ */
+async function processTorrentRecord(record) {
+  if (await checkAndUpdateTorrent(record)) {
+    return record;
+  }
+  const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
+  if (record.seeders === 0 && isOld) {
+    console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`);
+    return record;
+  }
+
+  const foundTorrent = await api_entry_limiter.schedule(() => rutor.torrent(record.torrentId).catch(() => undefined));
+
+  if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
+    return Promise.resolve(`${NAME}: Invalid torrent record: ${record.torrentId}`);
+  }
+  if (!foundTorrent.imdbId && !ALLOWED_WITHOUT_IMDB.includes(foundTorrent.category)) {
+    return Promise.resolve(`${NAME}: No imdbId defined: ${record.torrentId}`);
+  }
+
+  const torrent = {
+    provider: NAME,
+    infoHash: foundTorrent.infoHash,
+    torrentId: foundTorrent.torrentId,
+    torrentLink: foundTorrent.torrentLink,
+    trackers: foundTorrent.trackers,
+    title: foundTorrent.title,
+    type: TYPE_MAPPING[foundTorrent.category],
+    size: foundTorrent.size,
+    seeders: foundTorrent.seeders,
+    uploadDate: foundTorrent.uploadDate,
+    imdbId: foundTorrent.imdbId,
+    languages: foundTorrent.languages || undefined,
+  };
+
+  return createTorrentEntry(torrent).then(() => torrent);
+}
+
+// Number of listing pages scraped per category on every run.
+function getMaxPage(category) {
+  switch (category) {
+    case rutor.Categories.FOREIGN_FILMS:
+    case rutor.Categories.FOREIGN_SERIES:
+      return 2;
+    default:
+      return 1;
+  }
+}
+
+module.exports = { scrape, updateSeeders, NAME };
\ No newline at end of file