From 6cc0c5dc640f2519adf305481131277b386614bf Mon Sep 17 00:00:00 2001
From: TheBeastLT
Date: Sun, 8 Mar 2020 20:06:32 +0100
Subject: [PATCH] adds kickass scraper

---
 index.js                                      |  17 +-
 lib/repository.js                             |   2 +-
 lib/torrentEntries.js                         |  14 +-
 scrapers/1337x/1337x_api.js                   |  18 +-
 scrapers/1337x/1337x_scraper.js               |  46 +++--
 scrapers/horriblesubs/horriblesubs_api.js     |  12 +-
 scrapers/horriblesubs/horriblesubs_scraper.js |  33 +++-
 scrapers/kickass/kickass_api.js               | 187 ++++++++++++++++++
 scrapers/kickass/kickass_scraper.js           |  87 ++++++++
 scrapers/thepiratebay/thepiratebay_api.js     |  17 +-
 scrapers/thepiratebay/thepiratebay_scraper.js |  27 ++-
 11 files changed, 404 insertions(+), 56 deletions(-)
 create mode 100644 scrapers/kickass/kickass_api.js
 create mode 100644 scrapers/kickass/kickass_scraper.js

diff --git a/index.js b/index.js
index 369cba6..48eb057 100644
--- a/index.js
+++ b/index.js
@@ -5,21 +5,32 @@ const { connect } = require('./lib/repository');
 const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
 const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
 const leetxScraper = require('./scrapers/1337x/1337x_scraper');
+const kickassScraper = require('./scrapers/kickass/kickass_scraper');
 const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper');
 const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper');
 
-const providers = [thepiratebayScraper];
+const providers = [
+  // horribleSubsScraper,
+  // thepiratebayScraper,
+  kickassScraper,
+  // leetxScraper
+];
 
 async function scrape() {
-  providers.forEach((provider) => provider.scrape());
+  return providers
+      .reduce((promise, scraper) => promise.then(() => scraper.scrape()), Promise.resolve());
 }
 
+server.get('/', function (req, res) {
+  res.send(200);
+});
+
 server.post('/scrape', function (req, res) {
   scrape();
   res.send(200);
 });
 
-server.listen(7000, async function () {
+server.listen(process.env.PORT || 7000, async function () {
   await connect();
   console.log('Scraper started');
   scrape();
diff --git a/lib/repository.js b/lib/repository.js
index 0142efa..7e7e3ee 100644
--- a/lib/repository.js
+++ b/lib/repository.js
@@ -69,7 +69,7 @@ function getProvider(provider) {
 }
 
 function updateProvider(provider) {
-  return Provider.update(provider);
+  return Provider.update(provider, { where: { name: { [Op.eq]: provider.name } } });
 }
 
 function getTorrent(torrent) {
diff --git a/lib/torrentEntries.js b/lib/torrentEntries.js
index f31cd5d..fe48f69 100644
--- a/lib/torrentEntries.js
+++ b/lib/torrentEntries.js
@@ -47,4 +47,16 @@ async function getStoredTorrentEntry(torrent) {
       .catch(() => undefined);
 }
 
-module.exports = { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry };
+async function updateTorrentSeeders(torrent) {
+  if (torrent.seeders === undefined) {
+    return;
+  }
+
+  return repository.getTorrent(torrent)
+      .then(stored => {
+        stored.seeders = torrent.seeders;
+        return stored.save();
+      }).catch(() => undefined);
+}
+
+module.exports = { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders };
diff --git a/scrapers/1337x/1337x_api.js b/scrapers/1337x/1337x_api.js
index 2930c37..4ad2823 100644
--- a/scrapers/1337x/1337x_api.js
+++ b/scrapers/1337x/1337x_api.js
@@ -6,7 +6,7 @@ const decode = require('magnet-uri');
 const defaultProxies = [
   'https://1337x.to'
 ];
-const defaultTimeout = 30000;
+const defaultTimeout = 10000;
 
 const Categories = {
   MOVIE: 'Movies',
@@ -20,12 +20,12 @@ const Categories = {
   OTHER: 'Other',
 };
 
-function torrent(torrentSlug, config = {}, retries = 2) {
-  if (!torrentSlug || retries === 0) {
-    return Promise.reject(new Error(`Failed ${torrentSlug} query`));
+function torrent(torrentId, config = {}, retries = 2) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(new Error(`Failed ${torrentId} query`));
   }
   const proxyList = config.proxyList || defaultProxies;
-  const slug = torrentSlug.startsWith('/torrent/') ? torrentSlug.replace('/torrent/', '') : torrentSlug;
+  const slug = torrentId.startsWith('/torrent/') ? torrentId.replace('/torrent/', '') : torrentId;
 
   return raceFirstSuccessful(proxyList
       .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${slug}`, config)))
@@ -53,7 +53,7 @@ function browse(config = {}, retries = 2) {
   }
   const proxyList = config.proxyList || defaultProxies;
   const page = config.page || 1;
-  const category = config.category || 0;
+  const category = config.category;
 
   return raceFirstSuccessful(proxyList
       .map((proxyUrl) => singleRequest(`${proxyUrl}/cat/${category}/${page}/`, config)))
@@ -92,7 +92,7 @@ function parseTableBody(body) {
     const row = $(element);
     torrents.push({
       name: row.find('a').eq(1).text(),
-      slug: row.find('a').eq(1).attr('href').replace('/torrent/', ''),
+      torrentId: row.find('a').eq(1).attr('href').replace('/torrent/', ''),
       seeders: parseInt(row.children('td.coll-2').text()),
       leechers: parseInt(row.children('td.coll-3').text()),
       size: parseSize(row.children('td.coll-4').text())
@@ -116,9 +116,9 @@ function parseTorrentPage(body) {
   const imdbIdMatch = details.find('div[id=\'description\']').html().match(/imdb\.com\/title\/tt(\d+)/i);
 
   const torrent = {
-    name: decode(magnetLink).dn,
+    name: decode(magnetLink).name.replace(/\+/g, ' '),
+    infoHash: decode(magnetLink).infoHash,
     magnetLink: magnetLink,
-    infoHash: details.find('strong:contains(\'Infohash\')').next().text(),
     seeders: parseInt(details.find('strong:contains(\'Seeders\')').next().text(), 10),
     leechers: parseInt(details.find('strong:contains(\'Leechers\')').next().text(), 10),
     category: details.find('strong:contains(\'Category\')').next().text(),
diff --git a/scrapers/1337x/1337x_scraper.js b/scrapers/1337x/1337x_scraper.js
index f6f61ce..5aa6e26 100644
--- a/scrapers/1337x/1337x_scraper.js
+++ b/scrapers/1337x/1337x_scraper.js
@@ -3,16 +3,16 @@ const Bottleneck = require('bottleneck');
 const leetx = require('./1337x_api');
 const { Type } = require('../../lib/types');
 const repository = require('../../lib/repository');
-const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
+const {
+  createTorrentEntry,
+  createSkipTorrentEntry,
+  getStoredTorrentEntry,
+  updateTorrentSeeders
+} = require('../../lib/torrentEntries');
 
 const NAME = '1337x';
 const UNTIL_PAGE = 1;
-const TYPE_MAPPING = {
-  'Movies': Type.MOVIE,
-  'Documentaries': Type.MOVIE,
-  'TV': Type.SERIES,
-  'Anime': Type.ANIME
-};
+const TYPE_MAPPING = typeMapping();
 
 const limiter = new Bottleneck({ maxConcurrent: 40 });
 
@@ -26,16 +26,21 @@ async function scrape() {
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
         lastScrape.lastScrapedId = latestTorrents.length && latestTorrents[latestTorrents.length - 1].torrentId;
-        return lastScrape.save();
-      });
+        return repository.updateProvider(lastScrape);
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
 
 async function getLatestTorrents() {
-  const movies = await getLatestTorrentsForCategory(leetx.Categories.MOVIE);
-  const series = await getLatestTorrentsForCategory(leetx.Categories.TV);
-  const anime = await getLatestTorrentsForCategory(leetx.Categories.ANIME);
-  const docs = await getLatestTorrentsForCategory(leetx.Categories.DOCUMENTARIES);
-  return movies.concat(series).concat(anime).concat(docs);
+  const allowedCategories = [
+    leetx.Categories.MOVIE,
+    leetx.Categories.TV,
+    leetx.Categories.ANIME,
+    leetx.Categories.DOCUMENTARIES
+  ];
+
+  return Promise.all(allowedCategories.map(category => getLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
 }
 
 async function getLatestTorrentsForCategory(category, page = 1) {
@@ -48,10 +53,10 @@ async function getLatestTorrentsForCategory(category, page = 1) {
 
 async function processTorrentRecord(record) {
   if (await getStoredTorrentEntry(record)) {
-    return;
+    return updateTorrentSeeders(record);
   }
 
-  const torrentFound = await leetx.torrent(record.slug).catch(() => undefined);
+  const torrentFound = await leetx.torrent(record.torrentId).catch(() => undefined);
 
   if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) {
     return createSkipTorrentEntry(record);
@@ -72,4 +77,13 @@ async function processTorrentRecord(record) {
   return createTorrentEntry(torrent);
 }
 
+function typeMapping() {
+  const mapping = {};
+  mapping[leetx.Categories.MOVIE] = Type.MOVIE;
+  mapping[leetx.Categories.DOCUMENTARIES] = Type.MOVIE;
+  mapping[leetx.Categories.TV] = Type.SERIES;
+  mapping[leetx.Categories.ANIME] = Type.ANIME;
+  return mapping;
+}
+
 module.exports = { scrape };
\ No newline at end of file
diff --git a/scrapers/horriblesubs/horriblesubs_api.js b/scrapers/horriblesubs/horriblesubs_api.js
index 485e51d..83b15d0 100644
--- a/scrapers/horriblesubs/horriblesubs_api.js
+++ b/scrapers/horriblesubs/horriblesubs_api.js
@@ -3,7 +3,7 @@ const needle = require('needle');
 const moment = require('moment');
 
 const defaultUrl = 'https://horriblesubs.info';
-const defaultTimeout = 5000;
+const defaultTimeout = 10000;
 
 function allShows(config = {}) {
   return _getContent('/shows', config)
@@ -112,7 +112,15 @@ async function _findLatestEntry(entry, config) {
     reachedEnd = allEntries.length === 0;
   }
 
-  return foundEntry;
+  if (!foundEntry) {
+    return;
+  }
+  return {
+    title: foundEntry.title,
+    url: entry.urlEndpoint,
+    showId: showId,
+    singleEpisodes: [foundEntry]
+  };
 }
 
 function _parseDate(date) {
diff --git a/scrapers/horriblesubs/horriblesubs_scraper.js b/scrapers/horriblesubs/horriblesubs_scraper.js
index 3cf691c..8829b31 100644
--- a/scrapers/horriblesubs/horriblesubs_scraper.js
+++ b/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -1,4 +1,5 @@
 const fs = require('fs');
+const moment = require('moment');
 const Bottleneck = require('bottleneck');
 const decode = require('magnet-uri');
 const horriblesubs = require('./horriblesubs_api.js');
@@ -10,25 +11,43 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
 const showMappings = require('./horriblesubs_mapping.json');
 
 const NAME = 'HorribleSubs';
+const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days
 
 const limiter = new Bottleneck({ maxConcurrent: 5 });
 const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
 
 async function scrape() {
-  const lastScraped = await repository.getProvider({ name: NAME });
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  const lastScraped = lastScrape.lastScraped && moment.unix(lastScrape.lastScraped);
 
-  if (!lastScraped.lastScraped) {
-    console.log(`${NAME}: no previous scrapping exist`);
-    //await _scrapeAllShows()
+  if (!lastScraped || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart) {
+    console.log(`[${scrapeStart}] scraping all ${NAME} shows...`);
+    return _scrapeAllShows()
+        .then(() => {
+          lastScrape.lastScraped = scrapeStart;
+          return repository.updateProvider(lastScrape);
+        })
+        .then(() => console.log(`[${moment()}] finished scraping all ${NAME} shows`));
+  } else {
+    console.log(`[${scrapeStart}] scraping latest ${NAME} entries...`);
+    return _scrapeLatestEntries()
+        .then(() => console.log(`[${moment()}] finished scraping latest ${NAME} entries`));
   }
 }
 
+async function _scrapeLatestEntries() {
+  const latestEntries = await horriblesubs.getLatestEntries();
+
+  return Promise.all(latestEntries
+      .map((entryData) => limiter.schedule(() => _parseShowData(entryData)
+          .catch((err) => console.log(err)))));
+}
+
 async function _scrapeAllShows() {
-  console.log(`${NAME}: getting all shows...`);
   const shows = await horriblesubs.allShows();
 
   return Promise.all(shows
-      .slice(0, 5)
       .map((show) => limiter.schedule(() => horriblesubs.showData(show)
           .then((showData) => _parseShowData(showData))
           .catch((err) => console.log(err)))));
@@ -94,7 +113,7 @@ async function _parseShowData(showData) {
   }
 
   // sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
-  const kitsuIdsMapping = kitsuId.length && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
+  const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
       .then((metas) => metas.reduce((map, meta) => {
         const epOffset = Object.keys(map).length;
         [...Array(meta.totalCount).keys()]
diff --git a/scrapers/kickass/kickass_api.js b/scrapers/kickass/kickass_api.js
new file mode 100644
index 0000000..d7307de
--- /dev/null
+++ b/scrapers/kickass/kickass_api.js
@@ -0,0 +1,187 @@
+const cheerio = require('cheerio');
+const needle = require('needle');
+const moment = require('moment');
+const decode = require('magnet-uri');
+
+const defaultProxies = [
+  'https://katcr.co'
+];
+const defaultTimeout = 10000;
+
+const Categories = {
+  MOVIE: 'movies',
+  TV: 'tv',
+  ANIME: 'anime',
+  APPS: 'applications',
+  GAMES: 'games',
+  MUSIC: 'music',
+  BOOKS: 'books',
+  PORN: 'xxx',
+  OTHER: 'other',
+};
+
+function torrent(torrentId, config = {}, retries = 2) {
+  if (!torrentId || retries === 0) {
+    return Promise.reject(new Error(`Failed ${torrentId} search`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+
+  return raceFirstSuccessful(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config)))
+      .then((body) => parseTorrentPage(body))
+      .then((torrent) => ({ torrentId, ...torrent }))
+      .catch((err) => torrent(torrentId, config, retries - 1));
+}
+
+function search(keyword, config = {}, retries = 2) {
+  if (!keyword || retries === 0) {
+    return Promise.reject(new Error(`Failed ${keyword} search`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+  const page = config.page || 1;
+  const category = config.category;
+
+  return raceFirstSuccessful(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config)))
+      .then((body) => parseTableBody(body))
+      .catch((err) => search(keyword, config, retries - 1));
+}
+
+function browse(config = {}, retries = 2) {
+  if (retries === 0) {
+    return Promise.reject(new Error(`Failed browse request`));
+  }
+  const proxyList = config.proxyList || defaultProxies;
+  const page = config.page || 1;
+  const category = config.category;
+
+  return raceFirstSuccessful(proxyList
+      .map((proxyUrl) => singleRequest(`${proxyUrl}/category/${category}/page/${page}`, config)))
+      .then((body) => parseTableBody(body))
+      .catch((err) => browse(config, retries - 1));
+}
+
+function singleRequest(requestUrl, config = {}) {
+  const timeout = config.timeout || defaultTimeout;
+
+  return needle('get', requestUrl, { open_timeout: timeout, follow: 2 })
+      .then((response) => {
+        const body = response.body;
+        if (!body) {
+          throw new Error(`No body: ${requestUrl}`);
+        } else if (body.includes('Access Denied')) {
+          console.log(`Access Denied: ${requestUrl}`);
+          throw new Error(`Access Denied: ${requestUrl}`);
+        } else if (body.includes('502: Bad gateway') ||
+            body.includes('403 Forbidden') ||
+            body.includes('Origin DNS error') ||
+            !body.includes('Kickass Torrents')) {
+          throw new Error(`Invalid body contents: ${requestUrl}`);
+        }
+        return body;
+      });
+}
+
+function parseTableBody(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      reject(new Error('Failed loading body'));
+    }
+
+    const torrents = [];
+
+    $('.table > tbody > tr').each((i, element) => {
+      const row = $(element);
+      torrents.push({
+        torrentId: row.find('a[class="torrents_table__torrent_title"]').first().attr('href').replace('/torrent/', ''),
+        name: row.find('a[class="torrents_table__torrent_title"]').first().children('b').text(),
+        category: row.find('span[class="torrents_table__upload_info"]').first().children('a').first().attr('href')
+            .match(/category\/([^\/]+)/)[1],
+        seeders: parseInt(row.find('td[data-title="Seed"]').first().text()),
+        leechers: parseInt(row.find('td[data-title="Leech"]').first().text()),
+        size: parseSize(row.find('td[data-title="Size"]').first().text()),
+        uploadDate: moment(row.find('td[data-title="Age"]').first().attr('title')).toDate()
+      });
+    });
+
+    resolve(torrents);
+  });
+}
+
+function parseTorrentPage(body) {
+  return new Promise((resolve, reject) => {
+    const $ = cheerio.load(body);
+
+    if (!$) {
+      reject(new Error('Failed loading body'));
+    }
+    const content = $('div[class="col"]').first();
+    const info = content.find('div[class="torrent_stats"]').parent();
+    const description = content.find('div[id="main"]');
+    const magnetLink = info.find('a[title="Download verified Magnet"]').attr('href');
+    const imdbIdMatch = description.html().match(/imdb\.com\/title\/tt(\d+)/i);
+
+    const torrent = {
+      name: info.find('h1').first().text(),
+      infoHash: decode(magnetLink).infoHash,
+      magnetLink: magnetLink,
+      seeders: parseInt(info.find('span[class="torrent_stats__seed_count mr-2"]').first().text().match(/\d+/)[0], 10),
+      leechers: parseInt(info.find('span[class="torrent_stats__leech_count mr-2"]').first().text().match(/\d+/)[0], 10),
+      category: info.find('small').first().children('a').first().attr('href').match(/\/category\/([^\/]+)/)[1],
+      language: description.find('span:contains(\'Audio\')').next().children().eq(0).text(),
+      size: parseSize(description.find('ul[class="file_list"]').first().find('li').first().contents().eq(2).text()
+          .match(/\(Size: (.+)\)/)[1]),
+      uploadDate: moment(info.find('time').first().text()).toDate(),
+      imdbId: imdbIdMatch && `tt${imdbIdMatch[1].padStart(7, '0')}`,
+      files: content.find('ul[class="file_list"]').first().find('li > ul > li[class="file_list__file"]')
+          .map((i, elem) => $(elem))
+          .map((i, ele) => ({
+            fileIndex: i,
+            name: ele.find('span > ul > li').contents().eq(1).text().trim().replace(/^.+\//g, ''),
+            path: ele.find('span > ul > li').contents().eq(1).text().trim(),
+            size: parseSize(ele.contents().eq(2).text())
+          })).get()
+    };
+    if (torrent.files.length >= 50) {
+      // a max of 50 files are displayed on the page
+      delete torrent.files;
+    }
+    resolve(torrent);
+  });
+}
+
+function parseSize(sizeText) {
+  if (!sizeText) {
+    return undefined;
+  }
+  let scale = 1;
+  if (sizeText.includes('GB')) {
+    scale = 1024 * 1024 * 1024;
+  } else if (sizeText.includes('MB')) {
+    scale = 1024 * 1024;
+  } else if (sizeText.includes('KB') || sizeText.includes('kB')) {
+    scale = 1024;
+  }
+  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
+}
+
+function raceFirstSuccessful(promises) {
+  return Promise.all(promises.map((p) => {
+    // If a request fails, count that as a resolution so it will keep
+    // waiting for other possible successes. If a request succeeds,
+    // treat it as a rejection so Promise.all immediately bails out.
+    return p.then(
+        (val) => Promise.reject(val),
+        (err) => Promise.resolve(err)
+    );
+  })).then(
+      // If '.all' resolved, we've just got an array of errors.
+      (errors) => Promise.reject(errors),
+      // If '.all' rejected, we've got the result we wanted.
+      (val) => Promise.resolve(val)
+  );
+}
+
+module.exports = { torrent, search, browse, Categories };
diff --git a/scrapers/kickass/kickass_scraper.js b/scrapers/kickass/kickass_scraper.js
new file mode 100644
index 0000000..6a8317d
--- /dev/null
+++ b/scrapers/kickass/kickass_scraper.js
@@ -0,0 +1,87 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const kickass = require('./kickass_api');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const {
+  createTorrentEntry,
+  createSkipTorrentEntry,
+  getStoredTorrentEntry,
+  updateTorrentSeeders
+} = require('../../lib/torrentEntries');
+
+const NAME = 'KickassTorrents';
+const UNTIL_PAGE = 1;
+const TYPE_MAPPING = typeMapping();
+
+const limiter = new Bottleneck({ maxConcurrent: 40 });
+
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  const latestTorrents = await getLatestTorrents();
+  return Promise.all(latestTorrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent))))
+      .then(() => {
+        lastScrape.lastScraped = scrapeStart;
+        lastScrape.lastScrapedId = latestTorrents.length && latestTorrents[latestTorrents.length - 1].torrentId;
+        return repository.updateProvider(lastScrape);
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+}
+
+async function getLatestTorrents() {
+  const allowedCategories = [
+    kickass.Categories.MOVIE,
+    kickass.Categories.TV,
+    kickass.Categories.ANIME,
+  ];
+
+  return Promise.all(allowedCategories.map(category => getLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+async function getLatestTorrentsForCategory(category, page = 1) {
+  return kickass.browse(({ category, page }))
+      .then(torrents => torrents.length && page < UNTIL_PAGE
+          ? getLatestTorrentsForCategory(category, page + 1).then(nextTorrents => torrents.concat(nextTorrents))
+          : torrents)
+      .catch(() => []);
+}
+
+async function processTorrentRecord(record) {
+  if (await getStoredTorrentEntry(record)) {
+    return updateTorrentSeeders(record);
+  }
+
+  const torrentFound = await kickass.torrent(record.torrentId).catch(() => undefined);
+
+  if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) {
+    return createSkipTorrentEntry(record);
+  }
+
+  const torrent = {
+    infoHash: torrentFound.infoHash,
+    provider: NAME,
+    torrentId: torrentFound.torrentId,
+    title: torrentFound.name.replace(/\t|\s+/g, ' '),
+    size: torrentFound.size,
+    type: TYPE_MAPPING[torrentFound.category],
+    imdbId: torrentFound.imdbId,
+    uploadDate: torrentFound.uploadDate,
+    seeders: torrentFound.seeders,
+  };
+
+  return createTorrentEntry(torrent);
+}
+
+function typeMapping() {
+  const mapping = {};
+  mapping[kickass.Categories.MOVIE] = Type.MOVIE;
+  mapping[kickass.Categories.TV] = Type.SERIES;
+  mapping[kickass.Categories.ANIME] = Type.ANIME;
+  return mapping;
+}
+
+module.exports = { scrape };
\ No newline at end of file
diff --git a/scrapers/thepiratebay/thepiratebay_api.js b/scrapers/thepiratebay/thepiratebay_api.js
index 638b209..2e483d6 100644
--- a/scrapers/thepiratebay/thepiratebay_api.js
+++ b/scrapers/thepiratebay/thepiratebay_api.js
@@ -9,7 +9,7 @@ const defaultProxies = [
   'https://piratebays.cool',
   'https://piratebays.life'];
 const dumpUrl = '/static/dump/csv/';
-const defaultTimeout = 30000;
+const defaultTimeout = 10000;
 
 const Categories = {
   AUDIO: {
@@ -219,16 +219,15 @@ function parseSize(sizeText) {
   if (!sizeText) {
     return undefined;
   }
+  let scale = 1;
   if (sizeText.includes('GiB')) {
-    return Math.floor(parseFloat(sizeText.trim()) * 1024 * 1024 * 1024);
+    scale = 1024 * 1024 * 1024;
+  } else if (sizeText.includes('MiB')) {
+    scale = 1024 * 1024;
+  } else if (sizeText.includes('KiB')) {
+    scale = 1024;
   }
-  if (sizeText.includes('MiB')) {
-    return Math.floor(parseFloat(sizeText.trim()) * 1024 * 1024);
-  }
-  if (sizeText.includes('KiB')) {
-    return Math.floor(parseFloat(sizeText.trim()) * 1024);
-  }
-  return Math.floor(parseFloat(sizeText));
+  return Math.floor(parseFloat(sizeText) * scale);
 }
 
 function raceFirstSuccessful(promises) {
diff --git a/scrapers/thepiratebay/thepiratebay_scraper.js b/scrapers/thepiratebay/thepiratebay_scraper.js
index c857b03..d490606 100644
--- a/scrapers/thepiratebay/thepiratebay_scraper.js
+++ b/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -3,10 +3,15 @@ const Bottleneck = require('bottleneck');
 const thepiratebay = require('./thepiratebay_api.js');
 const { Type } = require('../../lib/types');
 const repository = require('../../lib/repository');
-const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
+const {
+  createTorrentEntry,
+  createSkipTorrentEntry,
+  getStoredTorrentEntry,
+  updateTorrentSeeders
+} = require('../../lib/torrentEntries');
 
 const NAME = 'ThePirateBay';
-const UNTIL_PAGE = 1;
+const UNTIL_PAGE = 20;
 
 const limiter = new Bottleneck({ maxConcurrent: 40 });
 
@@ -33,21 +38,27 @@ async function scrape() {
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
         lastScrape.lastScrapedId = latestTorrents.length && latestTorrents[latestTorrents.length - 1].torrentId;
-        return lastScrape.save();
-      });
+        return repository.updateProvider(lastScrape);
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
 
-async function getLatestTorrents(page = 0) {
-  return thepiratebay.browse(({ category: thepiratebay.Categories.VIDEO.ALL, page: page }))
+async function getLatestTorrents() {
+  return Promise.all(allowedCategories.map(category => getLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+async function getLatestTorrentsForCategory(category, page = 0) {
+  return thepiratebay.browse(({ category, page }))
       .then(torrents => torrents.length && page < UNTIL_PAGE
-          ? getLatestTorrents(page + 1).then(nextTorrents => torrents.concat(nextTorrents))
+          ? getLatestTorrentsForCategory(category, page + 1).then(nextTorrents => torrents.concat(nextTorrents))
           : torrents)
       .catch(() => []);
 }
 
 async function processTorrentRecord(record) {
   if (await getStoredTorrentEntry(record)) {
-    return;
+    return updateTorrentSeeders(record);
   }
 
   const torrentFound = await thepiratebay.torrent(record.torrentId).catch(() => undefined);
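
Notes on the patterns used in this patch (illustrative sketches only; the
stub objects below are not part of the diff):

The providers reduce() in index.js chains each scraper onto the previous
promise, so scrapers now run strictly one after another instead of all at
once as the old forEach did. A minimal sketch of the same pattern, with
stub providers standing in for the real scraper modules:

  // Illustrative stubs: each scrape() starts only after the previous settles.
  const stubProviders = [
    { scrape: () => Promise.resolve(console.log('first done')) },
    { scrape: () => Promise.resolve(console.log('second done')) }
  ];

  stubProviders
      .reduce((promise, scraper) => promise.then(() => scraper.scrape()), Promise.resolve())
      .then(() => console.log('all done')); // logs: first done, second done, all done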
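
raceFirstSuccessful() in kickass_api.js (mirrored in the other *_api.js
modules) inverts each promise so that Promise.all() short-circuits on the
first proxy that answers instead of the first one that fails. A usage
sketch, assuming one dead mirror and one healthy mirror:

  const attempts = [
    Promise.reject(new Error('mirror timed out')),    // dead mirror (assumed)
    Promise.resolve('<html>Kickass Torrents</html>')  // healthy mirror (assumed)
  ];

  raceFirstSuccessful(attempts)
      .then((body) => console.log('first successful body:', body))
      .catch((errors) => console.log('all mirrors failed:', errors.length));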
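
The reworked parseSize() helpers pick a single byte scale from the unit
suffix and floor the result, instead of repeating the multiplication in
every branch. Worked examples against the kickass variant, which also
strips thousands separators before parsing:

  parseSize('1.4 GB');   // Math.floor(1.4 * 1024 * 1024 * 1024) === 1503238553
  parseSize('700 MB');   // 700 * 1024 * 1024 === 734003200
  parseSize('1,234 KB'); // '1,234' -> 1234, times 1024 === 1263616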