From e71cce0be8a6a305cdb6012b37d85d0ac80337f5 Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Fri, 18 Feb 2022 19:41:01 +0100 Subject: [PATCH] add yts full scan scraper --- scraper/scheduler/scrapers.js | 2 ++ scraper/scrapers/yts/yts_api.js | 8 +++++++- scraper/scrapers/yts/yts_full_scraper.js | 15 +++++++++++++++ scraper/scrapers/yts/yts_scraper.js | 14 +++++--------- 4 files changed, 29 insertions(+), 10 deletions(-) create mode 100644 scraper/scrapers/yts/yts_full_scraper.js diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js index 860cfe4..cf931fa 100644 --- a/scraper/scheduler/scrapers.js +++ b/scraper/scheduler/scrapers.js @@ -1,6 +1,7 @@ const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper'); const thepiratebayFakeRemoval = require('../scrapers/thepiratebay/thepiratebay_fakes_removal'); const ytsScraper = require('../scrapers/yts/yts_scraper'); +const ytsFullScraper = require('../scrapers/yts/yts_full_scraper'); const eztvScraper = require('../scrapers/eztv/eztv_scraper'); const leetxScraper = require('../scrapers/1337x/1337x_scraper'); const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper'); @@ -19,6 +20,7 @@ const torrent9Scraper = require('../scrapers/torrent9/torrent9_scraper'); module.exports = [ { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' }, + { scraper: ytsFullScraper, name: ytsFullScraper.NAME, cron: '0 0 0 * * 0' }, { scraper: eztvScraper, name: eztvScraper.NAME, cron: '0 0 */4 ? * *' }, { scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' }, { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? 
* *' }, diff --git a/scraper/scrapers/yts/yts_api.js b/scraper/scrapers/yts/yts_api.js index 05b774b..3e77095 100644 --- a/scraper/scrapers/yts/yts_api.js +++ b/scraper/scrapers/yts/yts_api.js @@ -42,6 +42,12 @@ function browse(config = {}, retries = 2) { .catch(error => browse(config, retries - 1)); } +function maxPage() { + return Promises.first(defaultProxies + .map(proxyUrl => singleRequest(`${proxyUrl}/api/v2/list_movies.json?limit=${limit}`))) + .then(results => Math.ceil((results?.data?.movie_count || 0) / limit)); +} + function singleRequest(requestUrl, config = {}) { const timeout = config.timeout || defaultTimeout; const options = { headers: { 'User-Agent': getRandomUserAgent() }, timeout: timeout }; @@ -89,4 +95,4 @@ function formatType(type) { return type.toUpperCase(); } -module.exports = { torrent, search, browse }; \ No newline at end of file +module.exports = { torrent, search, browse, maxPage }; \ No newline at end of file diff --git a/scraper/scrapers/yts/yts_full_scraper.js b/scraper/scrapers/yts/yts_full_scraper.js new file mode 100644 index 0000000..b6af91d --- /dev/null +++ b/scraper/scrapers/yts/yts_full_scraper.js @@ -0,0 +1,15 @@ +const moment = require("moment"); +const yts = require('./yts_api'); +const scraper = require('./yts_scraper'); + + +async function scrape() { + const scrapeStart = moment(); + console.log(`[${scrapeStart}] starting ${scraper.NAME} full scrape...`); + + return yts.maxPage() + .then(maxPage => scraper.scrape(maxPage)) + .then(() => console.log(`[${moment()}] finished ${scraper.NAME} full scrape`)); +} + +module.exports = { scrape, NAME: scraper.NAME }; \ No newline at end of file diff --git a/scraper/scrapers/yts/yts_scraper.js b/scraper/scrapers/yts/yts_scraper.js index 50ceb7f..7e26620 100644 --- a/scraper/scrapers/yts/yts_scraper.js +++ b/scraper/scrapers/yts/yts_scraper.js @@ -10,12 +10,12 @@ const UNTIL_PAGE = 10; const limiter = new Bottleneck({ maxConcurrent: 10 }); -async function
scrape(maxPage) { const scrapeStart = moment(); const lastScrape = await repository.getProvider({ name: NAME }); console.log(`[${scrapeStart}] starting ${NAME} scrape...`); - return scrapeLatestTorrents() + return scrapeLatestTorrentsForCategory(maxPage) .then(() => { lastScrape.lastScraped = scrapeStart; return lastScrape.save(); @@ -27,11 +27,7 @@ async function updateSeeders(torrent) { return limiter.schedule(() => yts.torrent(torrent.torrentId)); } -async function scrapeLatestTorrents() { - return scrapeLatestTorrentsForCategory(); -} - -async function scrapeLatestTorrentsForCategory(page = 1) { +async function scrapeLatestTorrentsForCategory(maxPage = UNTIL_PAGE, page = 1) { console.log(`Scrapping ${NAME} page ${page}`); return yts.browse(({ page })) .catch(error => { @@ -39,8 +35,8 @@ async function scrapeLatestTorrentsForCategory(page = 1) { return Promise.resolve([]); }) .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent))))) - .then(resolved => resolved.length > 0 && page < UNTIL_PAGE - ? scrapeLatestTorrentsForCategory(page + 1) + .then(resolved => resolved.length > 0 && page < maxPage + ? scrapeLatestTorrentsForCategory(maxPage, page + 1) : Promise.resolve()); }