diff --git a/addon/lib/filter.js b/addon/lib/filter.js index 8dd5c2d..f0db655 100644 --- a/addon/lib/filter.js +++ b/addon/lib/filter.js @@ -48,6 +48,37 @@ const Providers = { key: 'nyaapantsu', label: 'NyaaPantsu', anime: true + }, + { + key: 'animestorrent', + label: 'AnimesTorrent', + anime: true, + foreign: true + }, + { + key: 'comando', + label: 'Comando', + foreign: true + }, + { + key: 'comoeubaixo', + label: 'ComoEuBaixo', + foreign: true + }, + { + key: 'darkmahou', + label: 'DarkMahou', + foreign: true + }, + { + key: 'lapumia', + label: 'Lapumia', + foreign: true + }, + { + key: 'ondebaixa', + label: 'OndeBaixa', + foreign: true } ] }; diff --git a/addon/lib/streamInfo.js b/addon/lib/streamInfo.js index 1854f6b..cd73694 100644 --- a/addon/lib/streamInfo.js +++ b/addon/lib/streamInfo.js @@ -100,6 +100,9 @@ function joinDetailParts(parts, prefix = '', delimiter = ' ') { } function formatSize(size) { + if (!size) { + return undefined; + } if (size === UNKNOWN_SIZE) { return undefined; } diff --git a/beamup.json b/beamup.json new file mode 100644 index 0000000..1e7d674 --- /dev/null +++ b/beamup.json @@ -0,0 +1,4 @@ +{ + "projectName": "brazuca-torrents", + "lastCommit": "a4112bc" +} \ No newline at end of file diff --git a/index.js b/index.js new file mode 100644 index 0000000..924e637 --- /dev/null +++ b/index.js @@ -0,0 +1,20 @@ +const express = require("express"); +const server = express(); +const serverless = require('./addon/serverless') +const { initBestTrackers } = require('./addon/lib/magnetHelper'); +const { connect } = require('./scraper/lib/repository'); +const { startScraper } = require('./scraper/scheduler/scheduler') + +server.get('/', function (req, res) { + res.sendStatus(200); +}); + + +server.use((req, res, next) => serverless(req, res, next)); +server.listen(process.env.PORT || 7000, async () => { + await connect(); + console.log('Scraper started'); + startScraper(); + initBestTrackers() + .then(() => console.log(`Started addon at: 
http://localhost:${process.env.PORT || 7000}`)); +}); \ No newline at end of file diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js index 4f42f9f..dafcd0e 100644 --- a/scraper/lib/torrent.js +++ b/scraper/lib/torrent.js @@ -170,7 +170,7 @@ function filterVideos(files) { const minSampleRatio = videos.length <= 3 ? 3 : 10; const minAnimeExtraRatio = 5; const minRedundantRatio = videos.length <= 3 ? 30 : Number.MAX_VALUE; - const isSample = video => video.path.match(/sample|bonus/i) && maxSize / parseInt(video.size) > minSampleRatio; + const isSample = video => video.path.match(/sample|bonus|promo/i) && maxSize / parseInt(video.size) > minSampleRatio; const isRedundant = video => maxSize / parseInt(video.size) > minRedundantRatio; const isExtra = video => video.path.match(/extras?\//i); const isAnimeExtra = video => video.path.match(/(?:\b|_)(?:NC)?(?:ED|OP|PV)(?:v?\d\d?)?(?:\b|_)/i) diff --git a/scraper/lib/torrentFiles.js b/scraper/lib/torrentFiles.js index 5a6fab7..ab1dd3d 100644 --- a/scraper/lib/torrentFiles.js +++ b/scraper/lib/torrentFiles.js @@ -78,6 +78,7 @@ async function parseSeriesFiles(torrent, parsedName, metadata) { .then(videos => videos .reduce((a, b) => a.concat(b), []) .map(video => isFeaturette(video) ? 
clearInfoFields(video) : video)) + return { contents, videos: parsedVideos, subtitles }; } diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js index 5339cbb..864645d 100644 --- a/scraper/scheduler/scrapers.js +++ b/scraper/scheduler/scrapers.js @@ -9,6 +9,13 @@ const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper'); const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper'); const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper'); const rutorScraper = require('../scrapers/rutor/rutor_scraper'); +const AnimesTorrent = require('../scrapers/animestorrent/animestorrent_scraper') +const Comando = require('../scrapers/comando/comando_scraper') +const ComoEuBaixo = require('../scrapers/comoeubaixo/comoeubaixo_scraper') +const DarkMahou = require('../scrapers/darkmahou/darkmahou_scraper') +const Lapumia = require('../scrapers/lapumia/lapumia_scraper') +const OndeBaixa = require('../scrapers/ondebaixa/ondebaixa_scraper'); + module.exports = [ { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' }, @@ -28,4 +35,10 @@ module.exports = [ // { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') } // { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') } // { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') } + { scraper: AnimesTorrent, name: AnimesTorrent.NAME, cron: '0 0 */4 ? * *' }, + { scraper: Comando, name: Comando.NAME, cron: '0 0 */4 ? * *' }, + { scraper: ComoEuBaixo, name: ComoEuBaixo.NAME, cron: '0 0 */4 ? * *' }, + { scraper: DarkMahou, name: DarkMahou.NAME, cron: '0 0 */4 ? * *' }, + { scraper: Lapumia, name: Lapumia.NAME, cron: '0 0 */4 ? * *' }, + { scraper: OndeBaixa, name: OndeBaixa.NAME, cron: '0 0 */4 ? 
* *' }, ]; diff --git a/scraper/scrapers/animestorrent/animestorrent_api.js b/scraper/scrapers/animestorrent/animestorrent_api.js new file mode 100644 index 0000000..8014d66 --- /dev/null +++ b/scraper/scrapers/animestorrent/animestorrent_api.js @@ -0,0 +1,149 @@ +const needle = require("needle"); +const cheerio = require("cheerio"); +const decode = require("magnet-uri"); +const Promises = require("../../lib/promises"); +const { getRandomUserAgent } = require("../../lib/requestHelper"); + +const defaultTimeout = 10000; +const maxSearchPage = 50; + +const defaultProxies = [ + "https://animestorrent.com" +]; + +const Categories = { + MOVIE: "filme", + ANIME: "tv", + OVA: 'ova' +}; + +function torrent(torrentId, config = {}, retries = 2) { + if (!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + const slug = torrentId.split("/")[3]; + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}`, config)) + ) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(torrentId, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1); + const requestUrl = (proxyUrl) => `${proxyUrl}/page/${page}/?s=${keyword}`; + + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)) + ) + .then((body) => parseTableBody(body)) + .then((torrents) => + torrents.length === 40 && page < extendToPage + ? 
search(keyword, { ...config, page: page + 1 }) + .catch(() => []) + .then((nextTorrents) => torrents.concat(nextTorrents)) + : torrents + ) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = (proxyUrl) => + category + ? `${proxyUrl}/tipo/${category}/page/${page}/` + : `${proxyUrl}/page/${page}/`; + + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)) + ) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { + userAgent: getRandomUserAgent(), + open_timeout: timeout, + follow: 2, + }; + + return needle("get", requestUrl, options).then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if ( + body.includes("502: Bad gateway") || + body.includes("403 Forbidden") + ) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error("Failed loading body")); + } + + const torrents = []; + + $("article.bs").each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("span.ntitle").text(), + torrentId: row.find("div > a").attr("href"), + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error("Failed loading body")); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + const 
magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const details = $('div.infox') + const torrent = magnets.map((magnetLink) => { + if (magnetLink.includes("magnet")) { + return { + title: decode(magnetLink).name, + original_name: details.find('h1.entry-title').text(), + year: details.find('b:contains(\'Lançamento:\')')[0].nextSibling.nodeValue.trim(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4], + uploadDate: new Date($("time[itemprop=dateModified]").attr("datetime")), + }; + } + }) + resolve(torrent.filter((x) => x)); + }); +} + +module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/animestorrent/animestorrent_scraper.js b/scraper/scrapers/animestorrent/animestorrent_scraper.js new file mode 100644 index 0000000..1f71b43 --- /dev/null +++ b/scraper/scrapers/animestorrent/animestorrent_scraper.js @@ -0,0 +1,131 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./animestorrent_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "AnimesTorrent"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = new Bottleneck({ maxConcurrent: 5 }); + +async function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => 
console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.ANIME, + leetx.Categories.OVA + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + title: torrentFound.title.replace(/\t|\s+/g, " ").trim(), + type: Type.ANIME, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.ANIME] = Type.SERIES; + mapping[leetx.Categories.OVA] = Type.ANIME + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.ANIME === category) { + return 5; + } + if (leetx.Categories.OVA === category) { + return 3; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME }; diff --git a/scraper/scrapers/comando/comando_api.js b/scraper/scrapers/comando/comando_api.js new file mode 100644 index 0000000..4ebd640 --- /dev/null +++ b/scraper/scrapers/comando/comando_api.js @@ -0,0 +1,182 @@ +const needle = require("needle") +const moment = require("moment") +const cheerio = require("cheerio"); +const decode = require('magnet-uri'); +const Promises = require('../../lib/promises'); +const { escapeHTML } = require('../../lib/metadata'); +const { getRandomUserAgent } = require("../../lib/requestHelper"); +moment.locale("pt-br"); + +const defaultTimeout = 10000; +const maxSearchPage = 50 + +const defaultProxies = [ + 'https://comando.to' +]; + +const Categories = { + MOVIE: 'filmes', + TV: 'series', + ANIME: 'animes', + DOCUMENTARIES: 'documentario' +}; + +function torrent(torrentId, config = {}, retries = 2) { + if (!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + 
const slug = torrentId.split("/")[3]; + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}`, config))) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(torrentId, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1)) + const requestUrl = proxyUrl => `${proxyUrl}/page/${page}/?s=${keyword}` + + return Promises.first(proxyList + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .then(body => parseTableBody(body)) + .then(torrents => torrents.length === 40 && page < extendToPage + ? search(keyword, { ...config, page: page + 1 }).catch(() => []) + .then(nextTorrents => torrents.concat(nextTorrents)) + : torrents) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = proxyUrl => `${proxyUrl}/category/${category}/page/${page}/` + + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 }; + + return needle('get', requestUrl, options) + .then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + 
} else if (body.includes('502: Bad gateway') || + body.includes('403 Forbidden')) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + + const torrents = []; + + $('article').each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("h2 > a").text(), + torrentId: row.find("h2 > a").attr("href") + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + let magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const details = $('b:contains(\'Original\')').parent() + const isAnime = parseCategory($('div.entry-categories').html()) === Categories.ANIME + const imdbIdMatch = details.find('a[href*="imdb.com"]').attr('href') + const torrent = magnets.map(magnetLink => { + const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) + if(isDubled(name) || isAnime) { + return { + name: name.replace(/ /g, '.'), + original_name: parseName(details.find('b:contains(\'Original\')')[0].nextSibling.nodeValue.replace(':', '')), + year: details.find('a[href*="comando.to/category/"]').text(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: parseCategory($('div.entry-categories').html()), + uploadDate: new Date(moment($('a.updated').text(), 'LL', true).format()), + imdbId: imdbIdMatch ? 
imdbIdMatch.split('/')[4] : null + }; + } + }) + resolve(torrent.filter((x) => x)); + }); +} + +function parseName(name) { + return name + .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') + .trim() +} + +function isDubled(name){ + name = name.toLowerCase() + if(name.includes('dublado')){ + return true + } + if(name.includes('dual')){ + return true + } + if(name.includes('nacional')){ + return true + } + if(name.includes('multi')){ + return true + } + return false +} + +function parseCategory(categorys) { + const $ = cheerio.load(categorys) + const isAnime = $('a:contains(\'animes\')').text() + const isMovie = $('a:contains(\'Filmes\')').text() + const isSerie = $('a:contains(\'Series\')').text() + if(isAnime) { + return Categories.ANIME + } + if (isMovie) { + return Categories.MOVIE + } + if(isSerie) { + return Categories.TV + } +} + +module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/comando/comando_scraper.js b/scraper/scrapers/comando/comando_scraper.js new file mode 100644 index 0000000..3e86ab0 --- /dev/null +++ b/scraper/scrapers/comando/comando_scraper.js @@ -0,0 +1,136 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./comando_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "Comando"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = new Bottleneck({ maxConcurrent: 5 }); + +async function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + 
return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.TV, + leetx.Categories.ANIME, + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return await Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + name: torrentFound.original_name, + title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), + type: TYPE_MAPPING[torrentFound.category], + year: torrentFound.year, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + size: torrentFound.size, + files: torrentFound.files + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.DOCUMENTARIES] = Type.SERIES; + mapping[leetx.Categories.TV] = Type.SERIES; + mapping[leetx.Categories.ANIME] = Type.ANIME; + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.ANIME === category) { + return 5; + } + if (leetx.Categories.TV === category) { + return 5; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME }; diff --git a/scraper/scrapers/comoeubaixo/comoeubaixo_api.js b/scraper/scrapers/comoeubaixo/comoeubaixo_api.js new file mode 100644 index 0000000..d17dc39 --- /dev/null +++ b/scraper/scrapers/comoeubaixo/comoeubaixo_api.js @@ -0,0 +1,174 @@ +const needle = require("needle") +const cheerio = require("cheerio"); +const decode = require('magnet-uri'); +const Promises = require('../../lib/promises'); +const { escapeHTML } = require('../../lib/metadata'); +const { getRandomUserAgent } = require("../../lib/requestHelper"); + +const defaultTimeout = 10000; +const maxSearchPage = 50 + +const defaultProxies = [ + 'https://comoeubaixo.com' +]; + +const Categories = { + MOVIE: 'filmes', + TV: 'series', + ANIME: 'anime', + DESENHOS: 'desenhos' +}; + +function torrent(torrentId, config = {}, retries = 2) { + if 
(!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + const slug = torrentId.split("/")[3]; + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}/`, config))) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(torrentId, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1)) + const requestUrl = proxyUrl => `${proxyUrl}/${keyword}/${page}/` + + return Promises.first(proxyList + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .then(body => parseTableBody(body)) + .then(torrents => torrents.length === 40 && page < extendToPage + ? search(keyword, { ...config, page: page + 1 }).catch(() => []) + .then(nextTorrents => torrents.concat(nextTorrents)) + : torrents) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = proxyUrl => category ? 
`${proxyUrl}/${category}/${page}/` : `${proxyUrl}/${page}/`; + + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 }; + + return needle('get', requestUrl, options) + .then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if (body.includes('502: Bad gateway') || + body.includes('403 Forbidden')) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + + const torrents = []; + + $('div.capa_larga.align-middle').each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("a").text(), + torrentId: row.find("a").attr("href") + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + let magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const details = $('div#informacoes') + const category = details.find('strong:contains(\'Gêneros: \')').next().attr('href').split('/')[0] + const isAnime = category === Categories.ANIME + const torrent = magnets.map(magnetLink => { + const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) + if(isDubled(name) || isAnime) { + return { + name: parseText(name), + original_name: parseName(details.find('strong:contains(\'Baixar\')')[0].nextSibling.nodeValue.split('-')[0]), + 
year: details.find('strong:contains(\'Data de Lançamento: \')').next().text().trim(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: category, + uploadDate: new Date($('time').attr('datetime')), + imdbId: details.find('a[href*="imdb.com"]').attr('href').split('/')[4], + }; + } + }) + resolve(torrent.filter((x) => x)); + }); +} + +function parseName(name) { + return name + .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') + .trim() +} + +function isDubled(name){ + name = name.toLowerCase() + if(name.includes('dublado')){ + return true + } + if(name.includes('dual')){ + return true + } + if(name.includes('nacional')){ + return true + } + return false +} + +function parseText(text) { + return text + .replace(/\n|\t/g, "") + .replace(/1A|2A|3A|4A|5A|6A|7A|8A|9A/g, '') + .replace(/COMOEUBAIXO.COM|COMANDO.TO|TEMPORADA|COMPLETA/g, '') + .replace(/MKV|MP4/g, '') + .replace(/[-]/g, '') + .replace(/[.]/g, ' ') + .trim() + .replace(/ /g, '.') + .trim() +} + +module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js b/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js new file mode 100644 index 0000000..d8991fb --- /dev/null +++ b/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js @@ -0,0 +1,137 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./comoeubaixo_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "ComoEuBaixo"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = new Bottleneck({ maxConcurrent: 5 }); + +async 
function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.TV, + leetx.Categories.DESENHOS + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return await Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + name: torrentFound.original_name, + title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), + type: TYPE_MAPPING[torrentFound.category], + year: torrentFound.year, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + size: torrentFound.size, + files: torrentFound.files + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.TV] = Type.SERIES; + mapping[leetx.Categories.ANIME] = Type.ANIME; + mapping[leetx.Categories.DESENHOS] = Type.SERIES; + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.DESENHOS === category) { + return 5; + } + if (leetx.Categories.TV === category) { + return 5; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME }; diff --git a/scraper/scrapers/darkmahou/darkmahou_api.js b/scraper/scrapers/darkmahou/darkmahou_api.js new file mode 100644 index 0000000..479a3f0 --- /dev/null +++ b/scraper/scrapers/darkmahou/darkmahou_api.js @@ -0,0 +1,150 @@ +const needle = require("needle"); +const Bottleneck = require("bottleneck"); +const cheerio = require("cheerio"); +const decode = require("magnet-uri"); +const Promises = require("../../lib/promises"); +const { getRandomUserAgent } = require("../../lib/requestHelper"); + +const defaultTimeout = 10000; +const maxSearchPage = 50; + +const limiter = new Bottleneck({ maxConcurrent: 10 }); + +const defaultProxies = ["https://darkmahou.com"]; + +const Categories = { + MOVIE: "movie", + ANIME: "tv", + OVA: 'ova' +}; + +function torrent(torrentId, config = {}, retries = 2) { + if 
(!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + const slug = torrentId.split("/")[3]; + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}`, config)) + ) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(slug, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1); + const requestUrl = (proxyUrl) => `${proxyUrl}/page/${page}/?s=${keyword}`; + + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)) + ) + .then((body) => parseTableBody(body)) + .then((torrents) => + torrents.length === 40 && page < extendToPage + ? search(keyword, { ...config, page: page + 1 }) + .catch(() => []) + .then((nextTorrents) => torrents.concat(nextTorrents)) + : torrents + ) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = (proxyUrl) => + category + ? 
`${proxyUrl}/category/${category}/page/${page}/` + : `${proxyUrl}/page/${page}/`; + + return Promises.first( + proxyList.map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)) + ) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { + userAgent: getRandomUserAgent(), + open_timeout: timeout, + follow: 2, + }; + + return needle("get", requestUrl, options).then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if ( + body.includes("502: Bad gateway") || + body.includes("403 Forbidden") + ) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error("Failed loading body")); + } + + const torrents = []; + + $("article.bs").each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("span.ntitle").text(), + torrentId: row.find("div > a").attr("href"), + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise(async(resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error("Failed loading body")); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + const magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const details = $('div.infox') + const torrent = magnets.map((magnetLink) => { + if (magnetLink.includes("magnet")) { + return { + title: decode(magnetLink).name, + original_name: details.find('h1.entry-title').text(), + year: details.find('b:contains(\'Lançado:\')')[0].nextSibling.nodeValue, + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4], 
+ uploadDate: new Date($("time[itemprop=dateModified]").attr("datetime")), + }; + } + }) + resolve(torrent.filter((x) => x)); + }); +} + +module.exports = { torrent, search, browse, Categories }; diff --git a/scraper/scrapers/darkmahou/darkmahou_scraper.js b/scraper/scrapers/darkmahou/darkmahou_scraper.js new file mode 100644 index 0000000..5076834 --- /dev/null +++ b/scraper/scrapers/darkmahou/darkmahou_scraper.js @@ -0,0 +1,131 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./darkmahou_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "DarkMahou"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = new Bottleneck({ maxConcurrent: 5 }); + +async function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.ANIME, + leetx.Categories.OVA + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + 
console.log(`Scrapping ${NAME} ${category} category page ${page}`); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + title: torrentFound.title.replace(/\t|\s+/g, " ").trim(), + type: Type.ANIME, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.ANIME] = Type.SERIES; + mapping[leetx.Categories.OVA] = Type.ANIME + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.ANIME === category) { + return 5; + } + if (leetx.Categories.OVA === category) { + return 4; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME }; diff --git a/scraper/scrapers/lapumia/lapumia_api.js b/scraper/scrapers/lapumia/lapumia_api.js new file mode 100644 index 0000000..206833c --- /dev/null +++ b/scraper/scrapers/lapumia/lapumia_api.js @@ -0,0 +1,177 @@ +const needle = require("needle") +const moment = require("moment") +const cheerio = require("cheerio"); +const decode = require('magnet-uri'); +const Promises = require('../../lib/promises'); +const { escapeHTML } = require('../../lib/metadata'); +const { getRandomUserAgent } = require("../../lib/requestHelper"); +moment.locale("pt-br"); + +const defaultTimeout = 10000; +const maxSearchPage = 50 + +const defaultProxies = [ + 'https://lapumia.org' +]; + +const Categories = { + MOVIE: null, + TV: 'series', + ANIME: 'animes', +}; + +function torrent(torrentId, config = {}, retries = 2) { + if (!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + const slug = 
torrentId.split('?p=')[1]; + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(`${proxyUrl}/?p=${slug}`, config))) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(slug, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1)) + const requestUrl = proxyUrl => `${proxyUrl}/page/${page}/?s=${keyword}` + + return Promises.first(proxyList + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .then(body => parseTableBody(body)) + .then(torrents => torrents.length === 10 && page < extendToPage + ? search(keyword, { ...config, page: page + 1 }).catch(() => []) + .then(nextTorrents => torrents.concat(nextTorrents)) + : torrents) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = proxyUrl => category ? 
`${proxyUrl}/${category}/page/${page}/` : `${proxyUrl}/page/${page}/` + + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 }; + + return needle('get', requestUrl, options) + .then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if (body.includes('502: Bad gateway') || + body.includes('403 Forbidden')) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + + const torrents = []; + + $('div.post').each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("div > a").text(), + torrentId: row.find("div > a").attr("href") + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + let magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const categorys = $('div.category').html(); + const details = $('div.content') + const isAnime = parseCategory(categorys) === Categories.ANIME + const torrent = magnets.map(magnetLink => { + const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) + if(isDubled(name) || isAnime) { + return { + name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')), + original_name: parseName(details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue), + year: 
details.find('b:contains(\'Ano de Lançamento:\')')[0].nextSibling.nodeValue.trim(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: parseCategory(categorys), + uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', true).format()), + imdbId: $('.imdbRatingPlugin').attr('data-title') || null + }; + }; + }) + resolve(torrent.filter((x) => x)); + }); +} + +function parseName(name) { + return name + .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') +} + +function isDubled(name){ + name = name.toLowerCase() + if(name.includes('dublado')){ + return true + } + if(name.includes('dual')){ + return true + } + if(name.includes('nacional')){ + return true + } + if(name.includes('multi')){ + return true + } + return false +} + +function parseCategory(categorys) { + const $ = cheerio.load(categorys) + const isAnime = $('a:contains(\'Animes\')').text() + const isSerie = $('a:contains(\'Series\')').text() + if(isAnime) { + return Categories.ANIME + } + if(isSerie) { + return Categories.TV + } + return Categories.MOVIE +} + +module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/lapumia/lapumia_scraper.js b/scraper/scrapers/lapumia/lapumia_scraper.js new file mode 100644 index 0000000..3fe2da2 --- /dev/null +++ b/scraper/scrapers/lapumia/lapumia_scraper.js @@ -0,0 +1,136 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./lapumia_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "Lapumia"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = 
new Bottleneck({ maxConcurrent: 5 }); + +async function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.TV, + leetx.Categories.ANIME, + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return await Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + name: torrentFound.original_name, + title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), + type: TYPE_MAPPING[torrentFound.category], + year: torrentFound.year, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + size: torrentFound.size, + files: torrentFound.files + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.TV] = Type.SERIES; + mapping[leetx.Categories.ANIME] = Type.ANIME; + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.TV === category) { + return 5; + } + if (leetx.Categories.ANIME === category) { + return 2; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME }; diff --git a/scraper/scrapers/ondebaixa/ondebaixa_api.js b/scraper/scrapers/ondebaixa/ondebaixa_api.js new file mode 100644 index 0000000..bb24e01 --- /dev/null +++ b/scraper/scrapers/ondebaixa/ondebaixa_api.js @@ -0,0 +1,193 @@ +const needle = require("needle") +const cheerio = require("cheerio"); +const decode = require('magnet-uri'); +const Promises = require('../../lib/promises'); +const { escapeHTML } = require('../../lib/metadata'); +const { getRandomUserAgent } = require("../../lib/requestHelper"); + +const defaultTimeout = 10000; +const maxSearchPage = 50 + +const defaultProxies = [ + 'https://ondebaixa.com' +]; + +const Categories = { + MOVIE: 'filmes', + TV: 'series', + ANIME: 'anime', + DESENHOS: 'desenhos' +}; + +function torrent(torrentId, config = {}, retries = 2) { + if (!torrentId || retries === 0) { + return Promise.reject(new 
Error(`Failed ${torrentId} query`)); + } + const proxyList = config.proxyList || defaultProxies; + const slug = torrentId.split("/")[3]; + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}/`, config))) + .then((body) => parseTorrentPage(body)) + .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) + .catch((err) => torrent(slug, config, retries - 1)); +} + +function search(keyword, config = {}, retries = 2) { + if (!keyword || retries === 0) { + return Promise.reject(new Error(`Failed ${keyword} search`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1)) + const requestUrl = proxyUrl => `${proxyUrl}/${keyword}/${page}/` + + return Promises.first(proxyList + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .then(body => parseTableBody(body)) + .then(torrents => torrents.length === 40 && page < extendToPage + ? search(keyword, { ...config, page: page + 1 }).catch(() => []) + .then(nextTorrents => torrents.concat(nextTorrents)) + : torrents) + .catch((err) => search(keyword, config, retries - 1)); +} + +function browse(config = {}, retries = 2) { + if (retries === 0) { + return Promise.reject(new Error(`Failed browse request`)); + } + const proxyList = config.proxyList || defaultProxies; + const page = config.page || 1; + const category = config.category; + const requestUrl = proxyUrl => category ? 
`${proxyUrl}/${category}/${page}/` : `${proxyUrl}/${page}/`; + + return Promises.first(proxyList + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .then((body) => parseTableBody(body)) + .catch((err) => browse(config, retries - 1)); +} + +function singleRequest(requestUrl, config = {}) { + const timeout = config.timeout || defaultTimeout; + const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 }; + + return needle('get', requestUrl, options) + .then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if (body.includes('502: Bad gateway') || + body.includes('403 Forbidden')) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); +} + +function parseTableBody(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + + const torrents = []; + + $('div.capa_larga.align-middle').each((i, element) => { + const row = $(element); + torrents.push({ + name: row.find("a").text(), + torrentId: row.find("a").attr("href") + }); + }); + resolve(torrents); + }); +} + +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error('Failed loading body')); + } + let magnets = []; + $(`a[href^="magnet"]`).each((i, section) => { + let magnet = $(section).attr("href"); + magnets.push(magnet); + }); + const details = $('div#informacoes') + const category = details.find('span:contains(\'Gêneros: \')').next().html() + const isAnime = parseCategory(category) === Categories.ANIME + const torrent = magnets.map(magnetLink => { + const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) + if(isDubled(name) || isAnime) { + return { + name: parseText(name), + original_name: parseName(details.find('span:contains(\'Título Original: \')').next().text()), + year: 
details.find('span:contains(\'Ano de Lançamento: \')').next().text().trim(), + infoHash: decode(magnetLink).infoHash, + magnetLink: magnetLink, + category: parseCategory(category), + uploadDate: new Date($('time').attr('datetime')), + }; + } + }) + resolve(torrent.filter((x) => x)); + }); +} + +function parseName(name) { + return name + .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') + .trim() +} + +function isDubled(name){ + name = name.toLowerCase() + if(name.includes('dublado')){ + return true + } + if(name.includes('dual')){ + return true + } + if(name.includes('nacional')){ + return true + } + return false +} + +function parseCategory(body) { + const $ = cheerio.load(body) + const isAnime = $("a[href*='anime']").text() + const isSerie = $("a[href*='series']").text() + const isMovie = $("a[href*='filmes']").text() + const isDesenho = $("a[href*='desenhos']").text() + if(isAnime) { + return Categories.ANIME + } + if(isSerie) { + return Categories.TV + } + if(isMovie) { + return Categories.MOVIE + } + if(isDesenho) { + return Categories.TV + } +} + +function parseText(text) { + return text + .replace(/\n|\t/g, "") + .replace(/1A|2A|3A|4A|5A|6A|7A|8A|9A/g, '') + .replace(/COMOEUBAIXO.COM|COMANDO.TO|TEMPORADA|COMPLETA/g, '') + .replace(/MKV|MP4/g, '') + .replace(/[-]/g, '') + .replace(/[.]/g, ' ') + .trim() + .replace(/ /g, '.') + .trim() +} + +module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/ondebaixa/ondebaixa_scraper.js b/scraper/scrapers/ondebaixa/ondebaixa_scraper.js new file mode 100644 index 0000000..93fe08d --- /dev/null +++ b/scraper/scrapers/ondebaixa/ondebaixa_scraper.js @@ -0,0 +1,136 @@ +const moment = require("moment"); +const Bottleneck = require("bottleneck"); +const leetx = require("./ondebaixa_api"); +const { Type } = require("../../lib/types"); +const repository = require("../../lib/repository"); +const Promises = require("../../lib/promises"); +const { createTorrentEntry, 
checkAndUpdateTorrent } = require("../../lib/torrentEntries"); +const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); +const { getImdbId } = require("../../lib/metadata"); + +const NAME = "OndeBaixa"; +const UNTIL_PAGE = 5; +const TYPE_MAPPING = typeMapping(); + +const limiter = new Bottleneck({ maxConcurrent: 5 }); + +async function scrape() { + const scrapeStart = moment(); + const lastScrape = await repository.getProvider({ name: NAME }); + console.log(`[${scrapeStart}] starting ${NAME} scrape...`); + + return scrapeLatestTorrents() + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); +} + +async function updateSeeders(torrent) { + return limiter.schedule(() => leetx.torrent(torrent.torrentId)); +} + +async function scrapeLatestTorrents() { + const allowedCategories = [ + leetx.Categories.MOVIE, + leetx.Categories.TV, + leetx.Categories.DESENHOS + ]; + + return Promises.sequence( + allowedCategories.map( + (category) => () => scrapeLatestTorrentsForCategory(category) + ) + ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); +} + +async function scrapeLatestTorrentsForCategory(category, page = 1) { + console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); + return leetx + .browse({ category, page }) + .catch((error) => { + console.warn( + `Failed ${NAME} scrapping for [${page}] ${category} due: `, + error + ); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +} + +async function processTorrentRecord(record) { + if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { + return record; + } + const torrentEntrys = await leetx + .torrent(record.torrentId) + .catch(() => undefined); + if (torrentEntrys === undefined) { + return Promise.resolve([]) + } + return await Promise.allSettled( + torrentEntrys.map(async (torrentFound) => { + if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { + return Promise.resolve("Invalid torrent record"); + } + if (isNaN(torrentFound.uploadDate)) { + console.warn( + `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` + ); + return; + } + if (await checkAndUpdateTorrent(torrentFound)) { + return torrentFound; + } + if (!torrentFound.size) { + await updateTorrentSize(torrentFound) + .catch((err) => Promise.resolve(err)) + } + if (!torrentFound.seeders) { + await updateCurrentSeeders(torrentFound) + .then(response => response.seeders === 0 ? 
delete response.seeders : response) + } + if (!torrentFound.imdbId) { + torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) + } + + const torrent = { + infoHash: torrentFound.infoHash, + provider: NAME, + torrentId: torrentFound.torrentId, + name: torrentFound.original_name, + title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), + type: TYPE_MAPPING[torrentFound.category], + year: torrentFound.year, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + size: torrentFound.size, + files: torrentFound.files + }; + return createTorrentEntry(torrent); + }) + ); +} + +function typeMapping() { + const mapping = {}; + mapping[leetx.Categories.MOVIE] = Type.MOVIE; + mapping[leetx.Categories.TV] = Type.SERIES; + mapping[leetx.Categories.ANIME] = Type.ANIME; + return mapping; +} + +function untilPage(category) { + if (leetx.Categories.DESENHOS === category) { + return 5; + } + if (leetx.Categories.TV === category) { + return 5; + } + return UNTIL_PAGE; +} + +module.exports = { scrape, updateSeeders, NAME };