From d7313ecb917a61dea8ff5424eb4a1eff27bb9fef Mon Sep 17 00:00:00 2001
From: TheBeastLT
Date: Sun, 31 Jan 2021 21:09:52 +0100
Subject: [PATCH] [scraper] add torrent galaxy provider

---
Review notes (changes on top of the original submission):
 - search(): the keyword is URL-encoded before it is put into the query string
 - parseTableBody(): rows that fail to parse are logged and skipped instead of
   rejecting the whole page
 - parseTableBody()/parseTorrentPage(): reject() is returned so parsing stops
 - parseInt() calls carry an explicit radix; parseSize() understands TB
 - scrapers.js: the scraper variable is named torrentGalaxyScraper
 - every definition carries a purpose comment / JSDoc
 - blob ids on the `index` lines are those of the original submission

 scraper/scheduler/scrapers.js                 |   4 +-
 .../torrentgalaxy/torrentgalaxy_api.js        | 244 ++++++++++++++++++
 .../torrentgalaxy/torrentgalaxy_scraper.js    | 153 +++++++++++
 3 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js
 create mode 100644 scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js

diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js
index 942741f..3641e95 100644
--- a/scraper/scheduler/scrapers.js
+++ b/scraper/scheduler/scrapers.js
@@ -1,5 +1,4 @@
 const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
-const horribleSubsScraper = require('../scrapers/horriblesubs/horriblesubs_scraper');
 const ytsScraper = require('../scrapers/yts/yts_scraper');
 const eztvScraper = require('../scrapers/eztv/eztv_scraper');
 const leetxScraper = require('../scrapers/1337x/1337x_scraper');
@@ -7,16 +6,17 @@ const kickassScraper = require('../scrapers/kickass/kickass_scraper');
 const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
 const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
 const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
+const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
 
 module.exports = [
   { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: eztvScraper, name: eztvScraper.NAME, cron: '0 0 */4 ? * *' },
-  // { scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
   { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
   { scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
+  { scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
   // { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
   // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
   // { scraper: require('../scrapers/1337x/1337x_search_scraper') }
diff --git a/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js b/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js
new file mode 100644
index 0000000..350ed57
--- /dev/null
+++ b/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js
@@ -0,0 +1,244 @@
+const cheerio = require('cheerio');
+const needle = require('needle');
+const moment = require('moment');
+const decode = require('magnet-uri');
+const Promises = require('../../lib/promises');
+
+// Base URLs every request is attempted against unless config.proxyList is given.
+const defaultProxies = [
+  'https://torrentgalaxy.to',
+  // 'https://torrentgalaxy.mx',
+  // 'https://torrentgalaxy.su'
+];
+// Passed to needle as open_timeout.
+const defaultTimeout = 10000;
+
+// Category ids as sent in the `cat` query parameter.
+const Categories = {
+  ANIME: '28',
+  MOVIE_4K: '3',
+  MOVIE_PACKS: '4',
+  MOVIE_SD: '1',
+  MOVIE_HD: '42',
+  MOVIE_CAM: '45',
+  MOVIE_BOLLYWOOD: '46',
+  TV_SD: '5',
+  TV_HD: '41',
+  TV_PACKS: '6',
+  TV_SPORT: '7',
+  DOCUMENTARIES: '9'
+};
+
+/**
+ * Fetches and parses a single torrent details page.
+ * @param {string} torrentId - id used in the `/torrent/<id>` url
+ * @param {{proxyList?: string[]}} [config]
+ * @param {number} [retries=2] - attempts left; rejects once this reaches 0
+ * @returns {Promise<object>} parsed page fields plus `torrentId`
+ */
+function torrent(torrentId, config = {}, retries = 2) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(new Error(`Failed ${torrentId} search`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+
+  return Promises.first(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`)))
+      .then((body) => parseTorrentPage(body))
+      .then((parsed) => ({ torrentId, ...parsed }))
+      .catch(() => torrent(torrentId, config, retries - 1));
+}
+
+/**
+ * Searches the torrent listing for a keyword.
+ * @param {string} keyword - search term, URL-encoded before it is sent
+ * @param {{proxyList?: string[], page?: number, category?: string}} [config] - page defaults to 1
+ * @param {number} [retries=2] - attempts left; rejects once this reaches 0
+ * @returns {Promise<object[]>} parsed listing rows
+ */
+function search(keyword, config = {}, retries = 2) {
+  if (!keyword || retries === 0) {
+    return Promise.reject(new Error(`Failed ${keyword} search`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+  const page = config.page || 1;
+  const category = config.category;
+  // Keywords may contain spaces, `&` or `#`, which would otherwise corrupt the query string.
+  const query = encodeURIComponent(keyword);
+
+  return Promises.first(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}&search=${query}`)))
+      .then((body) => parseTableBody(body))
+      .catch(() => search(keyword, config, retries - 1));
+}
+
+/**
+ * Lists the torrents of one listing page.
+ * @param {{proxyList?: string[], page?: number, category?: string}} [config] - page defaults to 1
+ * @param {number} [retries=2] - attempts left; rejects once this reaches 0
+ * @returns {Promise<object[]>} parsed listing rows
+ */
+function browse(config = {}, retries = 2) {
+  if (retries === 0) {
+    return Promise.reject(new Error(`Failed browse request`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+  const page = config.page || 1;
+  const category = config.category;
+
+  return Promises.first(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}`)))
+      .then((body) => parseTableBody(body))
+      .catch(() => browse(config, retries - 1));
+}
+
+/**
+ * Performs one GET request; rejects on an empty body or a body containing a known error marker.
+ * @param {string} requestUrl
+ * @returns {Promise<*>} the response body
+ */
+function singleRequest(requestUrl) {
+  const options = { open_timeout: defaultTimeout, follow: 2 };
+
+  return needle('get', requestUrl, options)
+      .then((response) => {
+        const body = response.body;
+        if (!body) {
+          throw new Error(`No body: ${requestUrl}`);
+        } else if (body.includes('Access Denied')) {
+          console.log(`Access Denied: ${requestUrl}`);
+          throw new Error(`Access Denied: ${requestUrl}`);
+        } else if (body.includes('502: Bad gateway') ||
+            body.includes('403 Forbidden') ||
+            body.includes('Origin DNS error')) {
+          throw new Error(`Invalid body contents: ${requestUrl}`);
+        }
+        return body;
+      });
+}
+
+/**
+ * Parses a listing page into torrent records.
+ * Rows that cannot be parsed (e.g. no magnet link) are logged and skipped,
+ * so one odd row no longer discards the whole page.
+ * @param {string} body - listing page HTML
+ * @returns {Promise<object[]>}
+ */
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+
+    const torrents = [];
+
+    $('.tgxtable > div').each((i, element) => {
+      if (i === 0) return; // skip the first div (presumably the header row)
+      try {
+        torrents.push(parseTableRow($(element)));
+      } catch (error) {
+        console.warn(`Skipping unparsable TorrentGalaxy row: ${error.message}`);
+      }
+    });
+
+    resolve(torrents);
+  });
+}
+
+// Extracts one torrent record from a listing row; throws when mandatory markup is missing.
+function parseTableRow(row) {
+  const magnetLink = row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(2)').attr('href');
+  const imdbIdMatch = row.html().match(/search=(tt\d+)/i);
+  return {
+    name: row.find('.tgxtablecell div a[title]').first().text(),
+    infoHash: decode(magnetLink).infoHash,
+    magnetLink: magnetLink,
+    torrentLink: row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(1)').first().attr('href'),
+    torrentId: row.find('.tgxtablecell div a[title]').first().attr('href').match(/torrent\/(\d+)/)[1],
+    verified: !!row.find('i.fa-check').length,
+    category: row.find('div:nth-of-type(n+2) .shrink a').first().attr('href').match(/cat=(\d+)$/)[1],
+    seeders: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'green\'] b').first().text(), 10),
+    leechers: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'#ff0000\'] b').first().text(), 10),
+    languages: row.find('.tgxtablecell img[title]').first().attr('title'),
+    size: parseSize(row.find('.collapsehide span.badge-secondary').first().text()),
+    uploadDate: parseDate(row.find('div.collapsehide:nth-of-type(12)').first().text()),
+    imdbId: imdbIdMatch && imdbIdMatch[1],
+  };
+}
+
+/**
+ * Parses a torrent details page.
+ * @param {string} body - details page HTML
+ * @returns {Promise<object>} rejects when mandatory markup such as the magnet link is missing
+ */
+function parseTorrentPage(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      return reject(new Error('Failed loading body'));
+    }
+    const content = $('div[class="torrentpagetable limitwidth"]').first();
+    const magnetLink = $('a[class="btn btn-danger"]').attr('href');
+    const imdbIdContent = $('a[title="IMDB link"]').attr('href');
+    const imdbIdMatch = imdbIdContent && imdbIdContent.match(/imdb\.com\/title\/(tt\d+)/i);
+
+    const torrent = {
+      name: content.find('.linebreakup a').first().text(),
+      infoHash: decode(magnetLink).infoHash,
+      magnetLink: magnetLink,
+      verified: !content.find('i.fa-exclamation-triangle').length,
+      torrentLink: $('a[class="btn btn-success"]').attr('href'),
+      seeders: parseInt(content.find('font[color=\'green\']').first().text(), 10),
+      category: content.find('div:nth-of-type(4) a:nth-of-type(2)').first().attr('href').match(/cat=(\d+)$/)[1],
+      languages: content.find('div:nth-of-type(5) div:nth-of-type(2)').first().text().trim(),
+      size: parseSize(content.find('div:nth-of-type(6) div:nth-of-type(2)').first().text()),
+      uploadDate: parseDate(content.find('div:nth-of-type(9) div:nth-of-type(2)').first().text()),
+      imdbId: imdbIdMatch && imdbIdMatch[1],
+    };
+    resolve(torrent);
+  });
+}
+
+/**
+ * Converts a human readable size such as "1.4 GB" into bytes.
+ * @param {string} sizeText
+ * @returns {number|undefined} size in bytes, `undefined` for empty input
+ */
+function parseSize(sizeText) {
+  if (!sizeText) {
+    return undefined;
+  }
+  let scale = 1;
+  if (sizeText.includes('TB')) {
+    scale = 1024 * 1024 * 1024 * 1024;
+  } else if (sizeText.includes('GB')) {
+    scale = 1024 * 1024 * 1024;
+  } else if (sizeText.includes('MB')) {
+    scale = 1024 * 1024;
+  } else if (sizeText.includes('KB') || sizeText.includes('kB')) {
+    scale = 1024;
+  }
+  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
+}
+
+/**
+ * Converts the upload date text into a Date.
+ * Relative "... ago" values are read as minutes when they contain "Min", otherwise as hours.
+ * @param {string} dateString - relative text or a `DD/MM/YY HH:mm` stamp
+ * @returns {Date}
+ */
+function parseDate(dateString) {
+  if (dateString.includes('ago')) {
+    const amount = parseInt(dateString, 10);
+    const unit = dateString.includes('Min') ? 'minutes' : 'hours';
+    return moment().subtract(amount, unit).toDate();
+  }
+  // Rewrites `DD/MM/YY HH:mm` to `DD-MM-20YY HH:mm` before handing it to moment.
+  const preparedDate = dateString.replace(/\//g, '-').replace(/-(\d{2})\s/, '-20$1 ');
+  return moment(preparedDate, 'DD-MM-YYYY HH:mm').toDate();
+}
+
+module.exports = { torrent, search, browse, Categories };
diff --git a/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js b/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js
new file mode 100644
index 0000000..409a450
--- /dev/null
+++ b/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js
@@ -0,0 +1,153 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const torrentGalaxy = require('./torrentgalaxy_api');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const Promises = require('../../lib/promises');
+const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
+
+const NAME = 'TorrentGalaxy';
+const TYPE_MAPPING = typeMapping();
+
+const limiter = new Bottleneck({ maxConcurrent: 10 });
+// Categories that get scraped, in this order.
+const allowedCategories = [
+  torrentGalaxy.Categories.ANIME,
+  torrentGalaxy.Categories.MOVIE_4K,
+  torrentGalaxy.Categories.MOVIE_PACKS,
+  torrentGalaxy.Categories.MOVIE_SD,
+  torrentGalaxy.Categories.MOVIE_HD,
+  torrentGalaxy.Categories.MOVIE_CAM,
+  torrentGalaxy.Categories.MOVIE_BOLLYWOOD,
+  torrentGalaxy.Categories.TV_SD,
+  torrentGalaxy.Categories.TV_HD,
+  torrentGalaxy.Categories.TV_PACKS,
+  torrentGalaxy.Categories.TV_SPORT,
+  torrentGalaxy.Categories.DOCUMENTARIES,
+];
+// Categories whose torrents are stored with `pack: true`.
+const packCategories = [
+  torrentGalaxy.Categories.MOVIE_PACKS,
+  torrentGalaxy.Categories.TV_PACKS
+];
+
+/**
+ * Scrapes the latest torrents of all allowed categories, then saves the scrape start time on the provider.
+ * @returns {Promise<void>}
+ */
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  // const ids = ['14212584'];
+  // return Promise.all(ids.map(id => limiter.schedule(() => torrentGalaxy.torrent(id)
+  //    .then(torrent => processTorrentRecord(torrent)))))
+  //    .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+  return scrapeLatestTorrents()
+      .then(() => {
+        lastScrape.lastScraped = scrapeStart;
+        return lastScrape.save();
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+}
+
+/**
+ * Re-fetches a torrent's details page through the shared limiter.
+ * @param {{torrentId: string}} torrent
+ * @returns {Promise<object>} freshly parsed torrent details
+ */
+async function updateSeeders(torrent) {
+  return limiter.schedule(() => torrentGalaxy.torrent(torrent.torrentId));
+}
+
+// Runs the per-category scrape for every allowed category via Promises.sequence and flattens the results.
+async function scrapeLatestTorrents() {
+  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+/**
+ * Scrapes one listing page of a category and recurses into the next page while the page
+ * yielded rows and getMaxPage(category) is not reached. A failed page is logged and treated as empty.
+ * Resolves with undefined once paging stops.
+ */
+async function scrapeLatestTorrentsForCategory(category, page = 1) {
+  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
+  return torrentGalaxy.browse({ category, page })
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
+      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
+      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
+          ? scrapeLatestTorrentsForCategory(category, page + 1)
+          : Promise.resolve());
+}
+
+/**
+ * Creates a torrent entry for a scraped record unless the record is unverified, has an
+ * unmapped category, or checkAndUpdateTorrent reports it as handled.
+ * @param {object} record - row produced by the torrentgalaxy_api module
+ * @returns {Promise<object|string>} the record/entry, or 'Invalid torrent record' when skipped
+ */
+async function processTorrentRecord(record) {
+  if (!record || !TYPE_MAPPING[record.category] || !record.verified) {
+    return Promise.resolve('Invalid torrent record');
+  }
+
+  if (await checkAndUpdateTorrent(record)) {
+    return record;
+  }
+
+  const torrent = {
+    provider: NAME,
+    infoHash: record.infoHash,
+    torrentId: record.torrentId,
+    torrentLink: record.torrentLink,
+    title: record.name.replace(/\t|\s+/g, ' '),
+    type: TYPE_MAPPING[record.category],
+    size: record.size,
+    seeders: record.seeders,
+    uploadDate: record.uploadDate,
+    imdbId: record.imdbId,
+    pack: packCategories.includes(record.category),
+    languages: !(record.languages || '').includes('Other') ? record.languages : undefined
+  };
+
+  return createTorrentEntry(torrent).then(() => torrent);
+}
+
+// Builds the lookup from TorrentGalaxy category id to the internal Type.
+function typeMapping() {
+  const mapping = {};
+  mapping[torrentGalaxy.Categories.MOVIE_SD] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.MOVIE_HD] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.MOVIE_4K] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.MOVIE_CAM] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.MOVIE_PACKS] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.MOVIE_BOLLYWOOD] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.DOCUMENTARIES] = Type.MOVIE;
+  mapping[torrentGalaxy.Categories.TV_SD] = Type.SERIES;
+  mapping[torrentGalaxy.Categories.TV_HD] = Type.SERIES;
+  mapping[torrentGalaxy.Categories.TV_PACKS] = Type.SERIES;
+  mapping[torrentGalaxy.Categories.TV_SPORT] = Type.SERIES;
+  mapping[torrentGalaxy.Categories.ANIME] = Type.ANIME;
+  return mapping;
+}
+
+// Number of listing pages scraped per category: 10 for TV SD/HD, 5 for movie SD/HD, 1 otherwise.
+function getMaxPage(category) {
+  switch (category) {
+    case torrentGalaxy.Categories.TV_SD:
+    case torrentGalaxy.Categories.TV_HD:
+      return 10;
+    case torrentGalaxy.Categories.MOVIE_SD:
+    case torrentGalaxy.Categories.MOVIE_HD:
+      return 5;
+    default:
+      return 1;
+  }
+}
+
+module.exports = { scrape, updateSeeders, NAME };
\ No newline at end of file