From 31b0c85fcf2862be06d2aa2643633e45d5467de1 Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Mon, 1 Feb 2021 19:46:46 +0100 Subject: [PATCH] [scraper] clean up torrent galaxy provider --- scraper/scrapers/1337x/1337x_api.js | 2 +- .../torrentgalaxy/torrentgalaxy_api.js | 49 ++++++++++--------- .../torrentgalaxy/torrentgalaxy_scraper.js | 18 ++++--- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/scraper/scrapers/1337x/1337x_api.js b/scraper/scrapers/1337x/1337x_api.js index 52bad3a..4877501 100644 --- a/scraper/scrapers/1337x/1337x_api.js +++ b/scraper/scrapers/1337x/1337x_api.js @@ -4,7 +4,7 @@ const Sugar = require('sugar-date'); const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); const { escapeHTML } = require('../../lib/metadata'); -const { getRandomUserAgent, defaultOptionsWithProxy } = require('../../lib/requestHelper'); +const { getRandomUserAgent } = require('../../lib/requestHelper'); const defaultProxies = [ 'https://1337x.to' diff --git a/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js b/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js index 350ed57..cd57181 100644 --- a/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js +++ b/scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js @@ -3,11 +3,12 @@ const needle = require('needle'); const moment = require('moment'); const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); +const { getRandomUserAgent } = require('../../lib/requestHelper'); const defaultProxies = [ - 'https://torrentgalaxy.to', + // 'https://torrentgalaxy.to', // 'https://torrentgalaxy.mx', - // 'https://torrentgalaxy.su' + 'https://torrentgalaxy.su' ]; const defaultTimeout = 10000; @@ -53,9 +54,9 @@ function search(keyword, config = {}, retries = 2) { .catch(() => search(keyword, config, retries - 1)); } -function browse(config = {}, retries = 2) { +function browse(config = {}, retries = 2, error = null) { if (retries === 0) { - return Promise.reject(new Error(`Failed browse request`)); + return Promise.reject(error || new Error(`Failed browse request`)); } const proxyList = config.proxyList || defaultProxies; const page = config.page || 1; @@ -64,17 +65,17 @@ function browse(config = {}, retries = 2) { return Promises.first(proxyList .map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}`))) .then((body) => parseTableBody(body)) - .catch(() => browse(config, retries - 1)); + .catch((err) => browse(config, retries - 1, err)); } function singleRequest(requestUrl) { - const options = { open_timeout: defaultTimeout, follow: 2 }; + const options = { userAgent: getRandomUserAgent(), open_timeout: defaultTimeout, follow: 2 }; return needle('get', requestUrl, options) .then((response) => { const body = response.body; if (!body) { - throw new Error(`No body: ${requestUrl}`); + throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`); } else if (body.includes('Access Denied')) { console.log(`Access Denied: ${requestUrl}`); throw new Error(`Access Denied: ${requestUrl}`); @@ -102,21 +103,25 @@ function parseTableBody(body) { const row = $(element); const magnetLink = row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(2)').attr('href'); const imdbIdMatch = row.html().match(/search=(tt\d+)/i); - torrents.push({ - name: row.find('.tgxtablecell div a[title]').first().text(), - infoHash: decode(magnetLink).infoHash, - magnetLink: magnetLink, - torrentLink: row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(1)').first().attr('href'), - torrentId: row.find('.tgxtablecell div a[title]').first().attr('href').match(/torrent\/(\d+)/)[1], - verified: !!row.find('i.fa-check').length, - category: row.find('div:nth-of-type(n+2) .shrink a').first().attr('href').match(/cat=(\d+)$/)[1], - seeders: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'green\'] b').first().text()), - leechers: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'#ff0000\'] b').first().text()), - languages: row.find('.tgxtablecell img[title]').first().attr('title'), - size: parseSize(row.find('.collapsehide span.badge-secondary').first().text()), - uploadDate: parseDate(row.find('div.collapsehide:nth-of-type(12)').first().text()), - imdbId: imdbIdMatch && imdbIdMatch[1], - }); + try { + torrents.push({ + name: row.find('.tgxtablecell div a[title]').first().text(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + torrentLink: row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(1)').first().attr('href'), + torrentId: row.find('.tgxtablecell div a[title]').first().attr('href').match(/torrent\/(\d+)/)[1], + verified: !!row.find('i.fa-check').length, + category: row.find('div:nth-of-type(n+2) .shrink a').first().attr('href').match(/cat=(\d+)$/)[1], + seeders: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'green\'] b').first().text()), + leechers: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'#ff0000\'] b').first().text()), + languages: row.find('.tgxtablecell img[title]').first().attr('title'), + size: parseSize(row.find('.collapsehide span.badge-secondary').first().text()), + uploadDate: parseDate(row.find('div.collapsehide:nth-of-type(12)').first().text()), + imdbId: imdbIdMatch && imdbIdMatch[1], + }); + } catch (e) { + console.error('Failed parsing TorrentGalaxy row: ', e); + } }); resolve(torrents); diff --git a/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js b/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js index 409a450..35c5603 100644 --- a/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js +++ b/scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js @@ -9,6 +9,7 @@ const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrent const NAME = 'TorrentGalaxy'; const TYPE_MAPPING = typeMapping(); +const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 }); const limiter = new Bottleneck({ maxConcurrent: 10 }); const allowedCategories = [ torrentGalaxy.Categories.ANIME, @@ -21,7 +22,6 @@ const allowedCategories = [ torrentGalaxy.Categories.TV_SD, torrentGalaxy.Categories.TV_HD, torrentGalaxy.Categories.TV_PACKS, - torrentGalaxy.Categories.TV_SPORT, torrentGalaxy.Categories.DOCUMENTARIES, ]; const packCategories = [ @@ -57,7 +57,7 @@ async function scrapeLatestTorrents() { async function scrapeLatestTorrentsForCategory(category, page = 1) { console.log(`Scrapping ${NAME} ${category} category page ${page}`); - return torrentGalaxy.browse(({ category, page })) + return api_limiter.schedule(() => torrentGalaxy.browse({ category, page })) .catch(error => { console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error); return Promise.resolve([]); @@ -73,10 +73,6 @@ async function processTorrentRecord(record) { return Promise.resolve('Invalid torrent record'); } - if (await checkAndUpdateTorrent(record)) { - return record; - } - const torrent = { provider: NAME, infoHash: record.infoHash, @@ -92,6 +88,15 @@ async function processTorrentRecord(record) { languages: !(record.languages || '').includes('Other') ? record.languages : undefined }; + if (await checkAndUpdateTorrent(torrent)) { + return torrent; + } + const isOld = moment(torrent.uploadDate).isBefore(moment().subtract(18, 'month')); + if (torrent.seeders === 0 && isOld && !torrent.pack) { + console.log(`Skipping old unseeded torrent [${torrent.infoHash}] ${torrent.title}`) + return torrent; + } + return createTorrentEntry(torrent).then(() => torrent); } @@ -116,7 +121,6 @@ function getMaxPage(category) { switch (category) { case torrentGalaxy.Categories.TV_SD: case torrentGalaxy.Categories.TV_HD: - return 10; case torrentGalaxy.Categories.MOVIE_SD: case torrentGalaxy.Categories.MOVIE_HD: return 5;