From 2351d92885b4b5e91feec46386a8a78a46f80c5b Mon Sep 17 00:00:00 2001
From: Gabisonfire
Date: Wed, 17 Jan 2024 18:22:01 -0500
Subject: [PATCH] Updated scrapers to latest available commit

---
 scraper/scrapers/erairaws/erairaws_rss_api.js |  68 +++++++
 scraper/scrapers/torrent9/torrent9_api.js     | 157 ++++++++++++++++
 scraper/scrapers/torrent9/torrent9_scraper.js | 139 ++++++++++++++
 scraper/scrapers/torrent9/torrent9v2_api.js   | 174 ++++++++++++++++++
 scraper/scrapers/yts/yts_full_scraper.js      |  20 ++
 5 files changed, 558 insertions(+)
 create mode 100644 scraper/scrapers/erairaws/erairaws_rss_api.js
 create mode 100644 scraper/scrapers/torrent9/torrent9_api.js
 create mode 100644 scraper/scrapers/torrent9/torrent9_scraper.js
 create mode 100644 scraper/scrapers/torrent9/torrent9v2_api.js
 create mode 100644 scraper/scrapers/yts/yts_full_scraper.js

diff --git a/scraper/scrapers/erairaws/erairaws_rss_api.js b/scraper/scrapers/erairaws/erairaws_rss_api.js
new file mode 100644
index 0000000..98c5745
--- /dev/null
+++ b/scraper/scrapers/erairaws/erairaws_rss_api.js
@@ -0,0 +1,68 @@
+const Parser = require('rss-parser');
+const decode = require("magnet-uri");
+
+const parser = new Parser({
+  customFields: {
+    item: [['erai:subtitles', 'subtitles']]
+  }
+});
+const baseUrl = 'https://www.erai-raws.info';
+const rssKey = process.env.ERAI_RSS_KEY;
+
+const Categories = {
+  ANIMES: 'anime',
+  EPISODES: 'episodes'
+};
+
+/**
+ * Fetches the magnet RSS feed and maps every feed item to a torrent record.
+ *
+ * @returns {Promise<Object[]>} records with `title`, `infoHash`, `trackers` and `languages`
+ */
+function browse() {
+  return parser.parseURL(`${baseUrl}/feed/?type=magnet&${rssKey}`)
+      .then(result => result.items
+          .map(item => {
+            const decodedMagnet = decode(item.link);
+            const languages = parseLanguages(item.subtitles);
+            return {
+              title: decodedMagnet.name,
+              infoHash: decodedMagnet.infoHash,
+              trackers: decodedMagnet.tr,
+              languages: languages
+            }
+          }));
+}
+
+// Language names for the two-letter codes used in the feed's subtitle tags.
+const languageMapping = {
+  'us': 'English',
+  'br': 'Portuguese(Brazil)',
+  'mx': 'Spanish(Latin_America)',
+  'es': 'Spanish',
+  'sa': 'Arabic',
+  'fr': 'French',
+  'de': 'German',
+  'it': 'Italian',
+  'ru': 'Russian'
+}
+
+/**
+ * Converts a subtitle tag string such as `[us][br]` into `English/Portuguese(Brazil)`.
+ * Codes without a mapping are kept unchanged.
+ *
+ * @param {string} [languages] raw `erai:subtitles` value of a feed item
+ * @returns {string|undefined} names joined with `/`; undefined when the item has no such value
+ */
+function parseLanguages(languages) {
+  // An item without the custom field would otherwise throw and fail the whole feed.
+  if (typeof languages !== 'string') {
+    return undefined;
+  }
+  return languages.split('][')
+      .map(lang => lang.replace(/[\[\]]/g, ''))
+      .map(lang => languageMapping[lang] || lang)
+      .join('/');
+}
+
+module.exports = { browse, Categories };
diff --git a/scraper/scrapers/torrent9/torrent9_api.js b/scraper/scrapers/torrent9/torrent9_api.js
new file mode 100644
index 0000000..0c409aa
--- /dev/null
+++ b/scraper/scrapers/torrent9/torrent9_api.js
@@ -0,0 +1,157 @@
+const axios = require('axios');
+const cheerio = require('cheerio');
+const moment = require('moment');
+const decode = require('magnet-uri');
+const { parse } = require('parse-torrent-title');
+const { getRandomUserAgent } = require('../../lib/requestHelper');
+const { parseSize } = require("../scraperHelper");
+
+const baseUrl = 'https://www.torrent9.re'
+const defaultTimeout = 10000;
+const pageSize = 50;
+
+const Categories = {
+  MOVIE: 'films',
+  TV: 'series',
+};
+
+/**
+ * Loads and parses the detail page of one torrent, retrying on any failure.
+ *
+ * @param {string} torrentId id taken from the `/torrent/{id}` link of a listing row
+ * @param {Object} [config] not read here, only passed along to the retry call
+ * @param {number} [retries] remaining attempts; rejects once it reaches 0
+ * @returns {Promise<Object>} the parsed page fields together with `torrentId`
+ */
+function torrent(torrentId, config = {}, retries = 2) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(new Error(`Failed ${torrentId} search`));
+  }
+
+  return singleRequest(`${baseUrl}/torrent/${torrentId}`)
+      .then((body) => parseTorrentPage(body))
+      .then((torrent) => ({ torrentId, ...torrent }))
+      .catch((err) => {
+        console.warn(`Failed Torrent9 ${torrentId} request: `, err);
+        return torrent(torrentId, config, retries - 1)
+      });
+}
+
+/**
+ * Loads one listing page of a category, retrying on any failure.
+ *
+ * @param {Object} [config] `category` (one of `Categories`) and optional 1-based `page`
+ * @param {number} [retries] remaining attempts; rejects once it reaches 0
+ * @param {*} [error] last failure, used as the rejection reason when retries run out
+ * @returns {Promise<Object[]>} rows with `name`, `torrentId` and `seeders`
+ */
+function browse(config = {}, retries = 2, error = null) {
+  if (retries === 0) {
+    return Promise.reject(error || new Error(`Failed browse request`));
+  }
+  const page = config.page || 1;
+  const category = config.category;
+  // The listing URL is addressed by row offset: page 1 -> 1, page 2 -> 51, ...
+  const offset = (page - 1) * pageSize + 1;
+
+  return singleRequest(`${baseUrl}/torrents/${category}/${offset}`)
+      .then((body) => parseTableBody(body))
+      .catch((err) => browse(config, retries - 1, err));
+}
+
+/**
+ * Performs a GET request with a random user agent and returns the response body.
+ *
+ * @param {string} requestUrl absolute URL to fetch
+ * @returns {Promise<string>} response body; rejects with the error message on failure or empty body
+ */
+function singleRequest(requestUrl) {
+  const headers = {
+    'user-agent': getRandomUserAgent(),
+    'accept-encoding': 'gzip, deflate',
+    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,lt;q=0.7,ar;q=0.6,fr;q=0.5,de;q=0.4'
+  };
+  const options = { headers, timeout: defaultTimeout };
+
+  return axios.get(requestUrl, options)
+      .then(response => {
+        const body = response.data;
+        if (!body || !body.length) {
+          throw new Error(`No body: ${requestUrl} with status ${response.status}`);
+        }
+        return body;
+      })
+      .catch(error => Promise.reject(error.message || error));
+}
+
+/**
+ * Extracts the torrent rows from a listing page.
+ * Rows that cannot be parsed are logged and skipped.
+ *
+ * @param {string} body HTML of the listing page
+ * @returns {Promise<Object[]>} rows with `name`, `torrentId` and `seeders`
+ */
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+
+    const torrents = [];
+
+    $('tbody tr').each((i, element) => {
+      const row = $(element);
+      const titleElement = row.find('td a');
+      try {
+        torrents.push({
+          name: titleElement.text().trim(),
+          torrentId: titleElement.attr('href').match(/torrent\/(.*)/)[1],
+          seeders: parseInt(row.find('span.seed_ok').first().text(), 10),
+        });
+      } catch (e) {
+        console.error('Failed parsing Torrent9 row: ', e);
+      }
+    });
+
+    resolve(torrents);
+  });
+}
+
+/**
+ * Extracts the torrent details from a torrent detail page.
+ *
+ * @param {string} body HTML of the detail page
+ * @returns {Promise<Object>} title, hash, links, seeders, category, size, upload date and languages;
+ *     rejects when the page lacks an element the parser depends on (e.g. the category link)
+ */
+function parseTorrentPage(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+    const details = $('.movie-detail');
+    const magnetLink = details.find('a[href^="magnet"]').first().attr('href');
+    const torrentLink = details.find('div.download-btn:nth-of-type(1) a').first().attr('href');
+    const name = details.find('p strong').contents().filter((_, e) => e.type === 'text').text() || $('h5, h1').text();
+    const languages = parse(name).languages;
+    const torrent = {
+      title: name.trim(),
+      infoHash: magnetLink ? decode(magnetLink).infoHash : undefined,
+      magnetLink: magnetLink,
+      torrentLink: torrentLink ? `${baseUrl}${torrentLink}` : undefined,
+      seeders: parseInt(details.find('.movie-information ul:nth-of-type(1) li:nth-of-type(3)').text(), 10),
+      category: details.find('ul:nth-of-type(4) a').attr('href').match(/\/(\w+)$/)[1],
+      size: parseSize(details.find('ul:nth-of-type(2) li:nth-of-type(3)').text()),
+      uploadDate: moment(details.find('ul:nth-of-type(3) li:nth-of-type(3)').text(), 'DD/MM/YYYY').toDate(),
+      // 'french' is set unless the title parser already reported it.
+      languages: languages && languages.includes('french') ? undefined : 'french',
+    };
+    resolve(torrent);
+  });
+}
+
+module.exports = { torrent, browse, Categories };
diff --git a/scraper/scrapers/torrent9/torrent9_scraper.js b/scraper/scrapers/torrent9/torrent9_scraper.js
new file mode 100644
index 0000000..ac6fd79
--- /dev/null
+++ b/scraper/scrapers/torrent9/torrent9_scraper.js
@@ -0,0 +1,139 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const torrent9 = require('./torrent9_api');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const Promises = require('../../lib/promises');
+const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
+
+const NAME = 'Torrent9';
+const TYPE_MAPPING = typeMapping();
+
+const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
+const limiter = new Bottleneck({ maxConcurrent: 10 });
+const allowedCategories = [
+  torrent9.Categories.MOVIE,
+  torrent9.Categories.TV,
+];
+const clients = [
+  torrent9
+];
+
+/**
+ * Scrapes the latest listings and then stores the scrape start time on the provider record.
+ *
+ * @returns {Promise<void>}
+ */
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  return scrapeLatestTorrents()
+      .then(() => {
+        lastScrape.lastScraped = scrapeStart;
+        return lastScrape.save();
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+}
+
+/**
+ * Builds one scrape task per allowed category and client and runs them through `Promises.sequence`.
+ *
+ * @returns {Promise<Array>} the per-task results flattened into one array
+ */
+async function scrapeLatestTorrents() {
+  const scrapeFunctions = allowedCategories
+      .map(category => clients.map(client => () => scrapeLatestTorrentsForCategory(client, category)))
+      .reduce((a, b) => a.concat(b), []);
+  return Promises.sequence(scrapeFunctions)
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+/**
+ * Scrapes one listing page of a category and continues with the next page until
+ * `getUntilPage(category)` is reached or a page yields no rows.
+ * A failed listing request is logged and treated as an empty page.
+ *
+ * @param {Object} client API module exposing `browse` and `torrent`
+ * @param {string} category one of `torrent9.Categories`
+ * @param {number} [page] 1-based page to start from
+ * @returns {Promise<Array>} resolves to an empty array once the last page is done
+ */
+async function scrapeLatestTorrentsForCategory(client, category, page = 1) {
+  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
+  return api_limiter.schedule(() => client.browse({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
+      .then(results => Promise.all(results.map(r => limiter.schedule(() => processTorrentRecord(client, r)))))
+      .then(resolved => resolved.length > 0 && page < getUntilPage(category)
+          ? scrapeLatestTorrentsForCategory(client, category, page + 1)
+          : Promise.resolve([]));
+}
+
+/**
+ * Creates a torrent entry for a listing row unless `checkAndUpdateTorrent` already handled it.
+ *
+ * @param {Object} client API module exposing `torrent`
+ * @param {Object} record listing row; only `torrentId` is read
+ * @returns {Promise<Object>} the created or skipped torrent, or `record` when the row was
+ *     matched by id or its detail page could not be retrieved
+ */
+async function processTorrentRecord(client, record) {
+  if (await checkAndUpdateTorrent({ provider: NAME, torrentId: record.torrentId })) {
+    return record;
+  }
+
+  const foundTorrent = await api_limiter.schedule(() => client.torrent(record.torrentId)).catch(() => undefined);
+  if (!foundTorrent || !foundTorrent.infoHash) {
+    console.warn(`Failed retrieving torrent ${record.torrentId}`);
+    return record;
+  }
+
+  const torrent = {
+    provider: NAME,
+    infoHash: foundTorrent.infoHash,
+    magnetLink: foundTorrent.magnetLink,
+    torrentLink: foundTorrent.torrentLink,
+    torrentId: foundTorrent.torrentId,
+    title: foundTorrent.title,
+    type: TYPE_MAPPING[foundTorrent.category],
+    size: foundTorrent.size,
+    seeders: foundTorrent.seeders,
+    uploadDate: foundTorrent.uploadDate,
+    imdbId: foundTorrent.imdbId,
+    languages: foundTorrent.languages
+  };
+
+  if (await checkAndUpdateTorrent(torrent)) {
+    console.info(`Skipping torrent ${torrent.torrentId} - [${torrent.infoHash}] ${torrent.title}`);
+    return torrent;
+  }
+
+  return createTorrentEntry(torrent).then(() => torrent);
+}
+
+/**
+ * @returns {Object} lookup from Torrent9 category slug to the internal `Type`
+ */
+function typeMapping() {
+  const mapping = {};
+  mapping[torrent9.Categories.MOVIE] = Type.MOVIE;
+  mapping[torrent9.Categories.TV] = Type.SERIES;
+  return mapping;
+}
+
+/**
+ * @param {string} category one of `torrent9.Categories`
+ * @returns {number} last listing page to scrape: 2 for series, 1 otherwise
+ */
+function getUntilPage(category) {
+  if (category === torrent9.Categories.TV) {
+    return 2;
+  }
+  return 1;
+}
+
+module.exports = { scrape, NAME };
\ No newline at end of file
diff --git a/scraper/scrapers/torrent9/torrent9v2_api.js b/scraper/scrapers/torrent9/torrent9v2_api.js
new file mode 100644
index 0000000..e6ce580
--- /dev/null
+++ b/scraper/scrapers/torrent9/torrent9v2_api.js
@@ -0,0 +1,174 @@
+const axios = require('axios');
+const cheerio = require('cheerio');
+const moment = require('moment');
+const decode = require('magnet-uri');
+const { parse } = require('parse-torrent-title');
+const { getRandomUserAgent } = require('../../lib/requestHelper');
+const { parseSize } = require("../scraperHelper");
+
+const baseUrl = 'https://www.torrent9.gg'
+const defaultTimeout = 10000;
+
+const Categories = {
+  MOVIE: 'films',
+  TV: 'series',
+};
+
+/**
+ * Loads and parses the detail page of one torrent, retrying on any failure.
+ *
+ * @param {string} torrentId id taken from the `/torrent/{id}` link of a listing row
+ * @param {Object} [config] not read here, only passed along to the retry call
+ * @param {number} [retries] remaining attempts; rejects once it reaches 0
+ * @returns {Promise<Object>} the parsed page fields together with `torrentId`
+ */
+function torrent(torrentId, config = {}, retries = 2) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(new Error(`Failed ${torrentId} search`));
+  }
+
+  return singleRequest(`${baseUrl}/torrent/${torrentId}`)
+      .then((body) => parseTorrentPage(body))
+      .then((torrent) => ({ torrentId, ...torrent }))
+      .catch((err) => {
+        console.warn(`Failed Torrent9 ${torrentId} request: `, err);
+        return torrent(torrentId, config, retries - 1)
+      });
+}
+
+/**
+ * Loads one listing page of a category, retrying on any failure.
+ *
+ * @param {Object} [config] `category` (one of `Categories`) and optional 1-based `page`
+ * @param {number} [retries] remaining attempts; rejects once it reaches 0
+ * @param {*} [error] last failure, used as the rejection reason when retries run out
+ * @returns {Promise<Object[]>} rows with `title`, `torrentId` and `seeders`
+ */
+function browse(config = {}, retries = 2, error = null) {
+  if (retries === 0) {
+    return Promise.reject(error || new Error(`Failed browse request`));
+  }
+  const page = config.page || 1;
+  const category = config.category;
+
+  return singleRequest(`${baseUrl}/torrents_${category}.html,page-${page}`)
+      .then((body) => parseTableBody(body))
+      .catch((err) => browse(config, retries - 1, err));
+}
+
+/**
+ * Performs a GET request with a random user agent and returns the response body.
+ *
+ * @param {string} requestUrl absolute URL to fetch
+ * @returns {Promise<string>} response body; rejects with the error message on failure or empty body
+ */
+function singleRequest(requestUrl) {
+  const headers = {
+    'user-agent': getRandomUserAgent(),
+    'accept-encoding': 'gzip, deflate',
+    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,lt;q=0.7,ar;q=0.6,fr;q=0.5,de;q=0.4'
+  };
+  const options = { headers, timeout: defaultTimeout };
+
+  return axios.get(requestUrl, options)
+      .then(response => {
+        const body = response.data;
+        if (!body || !body.length) {
+          throw new Error(`No body: ${requestUrl} with status ${response.status}`);
+        }
+        return body;
+      })
+      .catch(error => Promise.reject(error.message || error));
+}
+
+/**
+ * Extracts the torrent rows from a listing page.
+ * Rows without a link are ignored; rows that cannot be parsed are logged and skipped.
+ *
+ * @param {string} body HTML of the listing page
+ * @returns {Promise<Object[]>} rows with `title`, `torrentId` and `seeders`
+ */
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+
+    const torrents = [];
+
+    $('tr').each((i, element) => {
+      const row = $(element);
+      const titleElement = row.find('td a');
+      if (titleElement.length) {
+        // A malformed row (no title attribute, unexpected href) must not reject the whole page.
+        try {
+          torrents.push({
+            title: titleElement.attr('title').trim(),
+            torrentId: titleElement.attr('href').match(/torrent\/(.*)/)[1],
+            seeders: parseInt(row.find('span.seed_ok').first().text(), 10),
+          });
+        } catch (e) {
+          console.error('Failed parsing Torrent9 row: ', e);
+        }
+      }
+    });
+
+    resolve(torrents);
+  });
+}
+
+/**
+ * Extracts the torrent details from a torrent detail page.
+ *
+ * @param {string} body HTML of the detail page
+ * @returns {Promise<Object>} title, hash, magnet link, seeders, category, size, upload date and
+ *     languages; rejects when the page lacks an element the parser depends on (e.g. the category link)
+ */
+function parseTorrentPage(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+    const details = $('.movie-detail');
+    const magnetLink = details.find('a[href^="magnet"]').first().attr('href');
+    const name = getName(details) || $('h1').text();
+    const languages = parse(name).languages;
+    const torrent = {
+      title: name.trim(),
+      infoHash: magnetLink ? decode(magnetLink).infoHash : undefined,
+      magnetLink: magnetLink,
+      seeders: parseInt(details.find('.movie-information ul:nth-of-type(1) li:nth-of-type(3)').text(), 10),
+      category: details.find('ul:nth-of-type(4) a').attr('href').match(/_(\w+)\.html$/)[1],
+      size: parseSize(details.find('ul:nth-of-type(2) li:nth-of-type(3)').text()),
+      uploadDate: moment(details.find('ul:nth-of-type(3) li:nth-of-type(3)').text(), 'DD/MM/YYYY').toDate(),
+      // 'french' is set unless the title parser already reported it.
+      languages: languages && languages.includes('french') ? undefined : 'french',
+    };
+    resolve(torrent);
+  });
+}
+
+/**
+ * Reads the release name from the detail block.
+ * With exactly one `p strong` element its own text nodes are used; otherwise a
+ * release-like name is searched in the text of that selection's parent.
+ *
+ * @param {Object} details cheerio selection of `.movie-detail`
+ * @returns {string|null} the name, or null when the fallback pattern does not match
+ */
+function getName(details) {
+  const nameElement = details.find('p strong');
+  if (nameElement.length === 1) {
+    return nameElement.contents().filter((_, elem) => elem.type === 'text').text()
+  }
+  const description = nameElement.parent().text();
+  const nameMatch = description.match(
+      /(?:[A-Z]+[^A-Z0-9]*|[A-Z0-9-]+(?:[a-z]+\d+)?)\.([\w-]+\.){3,}\w+(?:-\w+)?(?=[A-Z])/);
+  return nameMatch && nameMatch[0];
+}
+
+module.exports = { torrent, browse, Categories };
diff --git a/scraper/scrapers/yts/yts_full_scraper.js b/scraper/scrapers/yts/yts_full_scraper.js
new file mode 100644
index 0000000..b6af91d
--- /dev/null
+++ b/scraper/scrapers/yts/yts_full_scraper.js
@@ -0,0 +1,20 @@
+const moment = require("moment");
+const yts = require('./yts_api');
+const scraper = require('./yts_scraper')
+
+
+/**
+ * Runs `scraper.scrape` with the value resolved by `yts.maxPage()`.
+ *
+ * @returns {Promise<void>}
+ */
+async function scrape() {
+  const scrapeStart = moment();
+  console.log(`[${scrapeStart}] starting ${scraper.NAME} full scrape...`);
+
+  return yts.maxPage()
+      .then(maxPage => scraper.scrape(maxPage))
+      .then(() => console.log(`[${moment()}] finished ${scraper.NAME} full scrape`));
+}
+
+module.exports = { scrape, NAME: scraper.NAME };
\ No newline at end of file