diff --git a/scraper/index.js b/scraper/index.js index 94a3aa1..4a498c5 100644 --- a/scraper/index.js +++ b/scraper/index.js @@ -2,7 +2,7 @@ require('dotenv').config(); const express = require("express"); const server = express(); const schedule = require('node-schedule'); -const { connect } = require('./lib/repository'); +const { connect, getUpdateSeedersTorrents } = require('./lib/repository'); const realDebrid = require('./moch/realdebrid'); const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper'); const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper'); @@ -24,6 +24,7 @@ const PROVIDERS = [ // thepiratebayUnofficialDumpScraper ]; const SCRAPE_CRON = process.env.SCRAPE_CRON || '0 0 */4 ? * *'; +const SEEDERS_CRON = '*/60 * * ? * *'; async function scrape() { return PROVIDERS @@ -36,9 +37,19 @@ async function scrape() { }, Promise.resolve()); } +async function updateSeeders() { + return getUpdateSeedersTorrents() + .then(torrents => Promise.all(torrents + .map(torrent => PROVIDERS.find(provider => provider.NAME === torrent.provider) + .updateSeeders(torrent)))); +} + function enableScheduling() { if (process.env.ENABLE_SCHEDULING) { - schedule.scheduleJob(SCRAPE_CRON, () => scrape().catch(error => console.error('Failed scraping: ', error))); + schedule.scheduleJob(SCRAPE_CRON, + () => scrape().catch(error => console.error('Failed scraping: ', error))); + // schedule.scheduleJob(SEEDERS_CRON, + // () => updateSeeders().catch(error => console.error('Failed update seeders: ', error))); } else { scrape().catch(error => console.error('Failed scraping: ', error)); } diff --git a/scraper/lib/cache.js b/scraper/lib/cache.js index 5c6259a..381d308 100644 --- a/scraper/lib/cache.js +++ b/scraper/lib/cache.js @@ -8,12 +8,14 @@ const METADATA_PREFIX = `${GLOBAL_KEY_PREFIX}|metadata`; const RESOLVED_URL_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|moch`; const PROXY_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|proxy`; const 
USER_AGENT_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|agent`; +const TRACKERS_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|trackers`; const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days const MEMORY_TTL = process.env.METADATA_TTL || 2 * 60 * 60; // 2 hours const RESOLVED_URL_TTL = 2 * 60; // 2 minutes const PROXY_TTL = 60 * 60; // 60 minutes const USER_AGENT_TTL = 2 * 24 * 60 * 60; // 2 days +const TRACKERS_TTL = 2 * 24 * 60 * 60; // 2 days const MONGO_URI = process.env.MONGODB_URI; @@ -75,12 +77,17 @@ function cacheUserAgent(id, method) { return cacheWrap(memoryCache, `${USER_AGENT_KEY_PREFIX}:${id}`, method, { ttl: { USER_AGENT_TTL } }); } +function cacheTrackers(method) { + return cacheWrap(memoryCache, `${TRACKERS_KEY_PREFIX}`, method, { ttl: TRACKERS_TTL }); +} + module.exports = { cacheWrapImdbId, cacheWrapKitsuId, cacheWrapMetadata, cacheWrapResolvedUrl, cacheWrapProxy, - cacheUserAgent + cacheUserAgent, + cacheTrackers }; diff --git a/scraper/lib/repository.js b/scraper/lib/repository.js index 1a79f94..a8ab7c2 100644 --- a/scraper/lib/repository.js +++ b/scraper/lib/repository.js @@ -1,3 +1,4 @@ +const moment = require('moment'); const { Sequelize, fn, col, literal } = require('sequelize'); const Op = Sequelize.Op; @@ -119,11 +120,18 @@ function getTorrentsBasedOnTitle(titleQuery, type) { function getTorrentsWithoutSize() { return Torrent.findAll({ where: literal( - 'exists (select 1 from files where files."infoHash" = torrent."infoHash" and files.size = 300000000) and random() < 0.01'), + 'exists (select 1 from files where files."infoHash" = torrent."infoHash" and files.size = 300000000)'), order: [ ['seeders', 'DESC'] - ], - limit: 1000 + ] }); } + +function getUpdateSeedersTorrents() { + const until = moment().subtract(7, 'days').format('YYYY-MM-DD'); + return Torrent.findAll({ + where: literal(`torrent."updatedAt" < \'${until}\' and random() < 0.001`), + limit: 100 }); } @@ -186,6 +194,7 @@ module.exports = { createTorrent, getTorrent,
getTorrentsBasedOnTitle, + getUpdateSeedersTorrents, createFile, getFiles, getFilesBasedOnTitle, diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js index 43e1896..79014df 100644 --- a/scraper/lib/torrent.js +++ b/scraper/lib/torrent.js @@ -4,20 +4,22 @@ const parseTorrent = require('parse-torrent'); const async = require('async'); const decode = require('magnet-uri'); const isVideo = require('./video'); +const { cacheTrackers } = require('./cache'); +const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_best.txt'; const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20; const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs module.exports.updateCurrentSeeders = function (torrent) { - return new Promise((resolve) => { - if (!torrent.magnetLink) { + return new Promise(async (resolve) => { + if (!torrent.magnetLink && !torrent.infoHash) { return resolve(0); } const Tracker = require("peer-search/tracker"); const seeders = {}; - const decodedMagnetLink = decode(torrent.magnetLink); - const trackers = decodedMagnetLink && decodedMagnetLink.tr; + const decodedMagnetLink = torrent.magnetLink && decode(torrent.magnetLink); + const trackers = decodedMagnetLink && decodedMagnetLink.tr || torrent.trackers || await getDefaultTrackers(); const callback = () => resolve(Math.max(...Object.values(seeders).map(values => values[0]).concat(0))); setTimeout(callback, SEEDS_CHECK_TIMEOUT); @@ -30,12 +32,19 @@ module.exports.updateCurrentSeeders = function (torrent) { ready(); }); }, callback); - }).then(seeders => ({ ...torrent, seeders: torrent.seeders || seeders })); + }).then(seeders => { + torrent.seeders = seeders; + return torrent; + }); }; module.exports.updateTorrentSize = function (torrent) { return filesAndSizeFromTorrentStream(torrent, SEEDS_CHECK_TIMEOUT) - .then(result => ({ ...torrent, size: result.size, files: result.files })); + .then(result => { + torrent.size = result.size; + torrent.files = 
result.files; + return torrent; + }); }; module.exports.sizeAndFiles = torrent => filesAndSizeFromTorrentStream(torrent, 30000); @@ -124,19 +133,9 @@ function filterExtras(files) { return files.filter(file => !isExtra(file)); } -function dynamicTimeout(torrent) { - if (torrent.seeders < 5) { - return 5000; - } else if (torrent.seeders < 10) { - return 7000; - } else if (torrent.seeders < 20) { - return 10000; - } else if (torrent.seeders < 30) { - return 15000; - } else if (torrent.seeders < 50) { - return 20000; - } else { - return 30000; - } +async function getDefaultTrackers() { + return cacheTrackers(() => needle('get', TRACKERS_URL, { open_timeout: SEEDS_CHECK_TIMEOUT }) + .then(response => response.body && response.body.trim()) + .then(body => body && body.split('\n\n') || [])); } diff --git a/scraper/lib/torrentEntries.js b/scraper/lib/torrentEntries.js index 391ea25..27c419d 100644 --- a/scraper/lib/torrentEntries.js +++ b/scraper/lib/torrentEntries.js @@ -63,6 +63,8 @@ async function updateTorrentSeeders(torrent) { .catch(() => undefined) .then(stored => { if (stored) { + console.log( + `Updated seeder for ${torrent.provider} [${torrent.infoHash}] ${torrent.title} from ${stored.seeders} to ${torrent.seeders}`); stored.seeders = torrent.seeders; stored.changed('updatedAt', true); return stored.save() diff --git a/scraper/scrapers/1337x/1337x_scraper.js b/scraper/scrapers/1337x/1337x_scraper.js index 81a95c0..90a1eb6 100644 --- a/scraper/scrapers/1337x/1337x_scraper.js +++ b/scraper/scrapers/1337x/1337x_scraper.js @@ -4,6 +4,7 @@ const leetx = require('./1337x_api'); const { Type } = require('../../lib/types'); const repository = require('../../lib/repository'); const Promises = require('../../lib/promises'); +const { updateCurrentSeeders } = require('../../lib/torrent'); const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries'); const NAME = '1337x'; @@ -25,6 +26,13 @@ async function scrape() { .then(() 
=> console.log(`[${moment()}] finished ${NAME} scrape`)); } +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId) + .then(record => (torrent.seeders = record.seeders, torrent)) + .catch(() => updateCurrentSeeders(torrent)) + .then(updated => updateTorrentSeeders(updated))); +} + async function scrapeLatestTorrents() { const allowedCategories = [ leetx.Categories.MOVIE, @@ -80,15 +88,6 @@ async function processTorrentRecord(record) { return createTorrentEntry(torrent); } -async function updateSeeders() { - const startDate = moment().subtract(7, 'day').toDate(); - const endDate = moment().subtract(1, 'day').toDate(); - return repository.getTorrentsUpdatedBetween(NAME, startDate, endDate) - .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => leetx.torrent(torrent.torrentId) - .then(foundTorrent => updateTorrentSeeders(foundTorrent)) - .catch(error => console.warn(error)))))) -} - function typeMapping() { const mapping = {}; mapping[leetx.Categories.MOVIE] = Type.MOVIE; @@ -98,4 +97,4 @@ function typeMapping() { return mapping; } -module.exports = { scrape, NAME }; \ No newline at end of file +module.exports = { scrape, updateSeeders, NAME }; \ No newline at end of file diff --git a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js index 3fe8172..79c1c7a 100644 --- a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js +++ b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js @@ -7,6 +7,7 @@ const repository = require('../../lib/repository'); const { Type } = require('../../lib/types'); const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent'); const { parseTorrentFiles } = require('../../lib/torrentFiles'); +const { updateTorrentSeeders } = require('../../lib/torrentEntries'); const { getMetadata, getKitsuId } = require('../../lib/metadata'); const showMappings = require('./horriblesubs_mapping.json'); @@ 
-36,6 +37,11 @@ async function scrape() { } } +async function updateSeeders(torrent) { + return entryLimiter.schedule(() => updateCurrentSeeders(torrent) + .then(updated => updateTorrentSeeders(updated))); +} + async function _scrapeLatestEntries() { const latestEntries = await horriblesubs.getLatestEntries(); @@ -209,4 +215,4 @@ async function checkIfExists(torrent) { return { ...torrent, size: existingTorrent.size, seeders: existingTorrent.seeders }; } -module.exports = { scrape, NAME }; \ No newline at end of file +module.exports = { scrape, updateSeeders, NAME }; \ No newline at end of file diff --git a/scraper/scrapers/kickass/kickass_scraper.js b/scraper/scrapers/kickass/kickass_scraper.js index 367b6b1..e52455d 100644 --- a/scraper/scrapers/kickass/kickass_scraper.js +++ b/scraper/scrapers/kickass/kickass_scraper.js @@ -4,6 +4,7 @@ const kickass = require('./kickass_api'); const { Type } = require('../../lib/types'); const repository = require('../../lib/repository'); const Promises = require('../../lib/promises'); +const { updateCurrentSeeders } = require('../../lib/torrent'); const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries'); const NAME = 'KickassTorrents'; @@ -25,6 +26,13 @@ async function scrape() { .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); } +async function updateSeeders(torrent) { + return limiter.schedule(() => kickass.torrent(torrent.torrentId) + .then(record => (torrent.seeders = record.seeders, torrent)) + .catch(() => updateCurrentSeeders(torrent)) + .then(updated => updateTorrentSeeders(updated))); +} + async function scrapeLatestTorrents() { const allowedCategories = [ kickass.Categories.MOVIE, @@ -83,4 +91,4 @@ function typeMapping() { return mapping; } -module.exports = { scrape, NAME }; \ No newline at end of file +module.exports = { scrape, updateSeeders, NAME }; \ No newline at end of file diff --git a/scraper/scrapers/rarbg/rarbg_scraper.js 
b/scraper/scrapers/rarbg/rarbg_scraper.js index eea108a..b883c0d 100644 --- a/scraper/scrapers/rarbg/rarbg_scraper.js +++ b/scraper/scrapers/rarbg/rarbg_scraper.js @@ -8,6 +8,7 @@ const Promises = require('../../lib/promises'); const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries'); const NAME = 'RARBG'; +const SEARCH_OPTIONS = { limit: 100, sort: 'seeders', format: 'json_extended', ranked: 0 }; const limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 }); const entryLimiter = new Bottleneck({ maxConcurrent: 40 }); @@ -25,6 +26,17 @@ async function scrape() { .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); } +async function updateSeeders(torrent) { + const imdbIds = await repository.getFiles(torrent) + .then(files => files.map(file => file.imdbId)) + .then(ids => Array.from(new Set(ids))); + + return Promise.all(imdbIds.map(imdbId => limiter.schedule(() => rarbg.search(imdbId, SEARCH_OPTIONS, 'imdb')))) + .then(results => results.reduce((a, b) => a.concat(b), [])) + .then(results => results.map(result => toTorrent(result))) + .then(torrents => Promise.all(torrents.map(updated => updateTorrentSeeders(updated)))); +} + async function scrapeLatestTorrents() { const allowedCategories = [ rarbg.CATEGORY['4K_MOVIES_X264_4k'], @@ -51,17 +63,7 @@ async function scrapeLatestTorrents() { async function scrapeLatestTorrentsForCategory(category) { console.log(`Scrapping ${NAME} ${category} category`); return rarbg.list({ category: category, limit: 100, sort: 'last', format: 'json_extended', ranked: 0 }) - .then(torrents => torrents.map(torrent => ({ - name: torrent.title, - infoHash: decode(torrent.download).infoHash, - magnetLink: torrent.download, - seeders: torrent.seeders, - leechers: torrent.leechers, - category: torrent.category, - size: torrent.size, - uploadDate: new Date(torrent.pubdate), - imdbId: torrent.episode_info && torrent.episode_info.imdb - }))) + .then(results => 
results.map(result => toTorrent(result))) .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))))) .catch(error => { console.warn(`Failed ${NAME} scrapping for ${category} due: `, error); @@ -77,7 +79,7 @@ async function processTorrentRecord(record) { const torrent = { provider: NAME, infoHash: record.infoHash, - title: record.name, + title: record.title, type: getType(record.category), seeders: record.seeders, size: record.size, @@ -88,6 +90,21 @@ async function processTorrentRecord(record) { return createTorrentEntry(torrent); } +function toTorrent(result) { + return { + title: result.title, + provider: NAME, + infoHash: decode(result.download).infoHash, + magnetLink: result.download, + seeders: result.seeders, + leechers: result.leechers, + category: result.category, + size: result.size, + uploadDate: new Date(result.pubdate), + imdbId: result.episode_info && result.episode_info.imdb + }; +} + const seriesCategories = [ 'TV Episodes', 'Movies/TV-UHD-episodes', @@ -101,4 +118,4 @@ function getType(category) { return Type.MOVIE; } -module.exports = { scrape, NAME }; \ No newline at end of file +module.exports = { scrape, updateSeeders, NAME }; \ No newline at end of file diff --git a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js index 592e555..08aced0 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js @@ -4,6 +4,7 @@ const thepiratebay = require('./thepiratebay_api.js'); const { Type } = require('../../lib/types'); const repository = require('../../lib/repository'); const Promises = require('../../lib/promises'); +const { updateCurrentSeeders } = require('../../lib/torrent'); const { createTorrentEntry, getStoredTorrentEntry, updateTorrentSeeders } = require('../../lib/torrentEntries'); const NAME = 'ThePirateBay'; @@ -37,6 +38,13 @@ async function scrape() { .then(() => 
console.log(`[${moment()}] finished ${NAME} scrape`)); } +async function updateSeeders(torrent) { + return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId) + .then(record => (torrent.seeders = record.seeders, torrent)) + .catch(() => updateCurrentSeeders(torrent)) + .then(updated => updateTorrentSeeders(updated))); +} + async function scrapeLatestTorrents() { return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category))) .then(entries => entries.reduce((a, b) => a.concat(b), [])); @@ -81,4 +89,4 @@ async function processTorrentRecord(record) { return createTorrentEntry(torrent); } -module.exports = { scrape, NAME }; \ No newline at end of file +module.exports = { scrape, updateSeeders, NAME }; \ No newline at end of file