diff --git a/scraper/scrapers/comoeubaixo/comoeubaixo_api.js b/scraper/scrapers/comoeubaixo/comoeubaixo_api.js index d17dc39..5eee3ad 100644 --- a/scraper/scrapers/comoeubaixo/comoeubaixo_api.js +++ b/scraper/scrapers/comoeubaixo/comoeubaixo_api.js @@ -3,7 +3,8 @@ const cheerio = require("cheerio"); const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); const { escapeHTML } = require('../../lib/metadata'); -const { getRandomUserAgent } = require("../../lib/requestHelper"); +const { getRandomUserAgent } = require('../../lib/requestHelper'); +const { isPtDubbed, sanitizePtName, sanitizePtLanguages } = require('../scraperHelper') const defaultTimeout = 10000; const maxSearchPage = 50 @@ -26,7 +27,7 @@ function torrent(torrentId, config = {}, retries = 2) { const proxyList = config.proxyList || defaultProxies; const slug = torrentId.split("/")[3]; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}/`, config))) + .map((proxyUrl) => singleRequest(`${proxyUrl}/${slug}/`, config))) .then((body) => parseTorrentPage(body)) .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) .catch((err) => torrent(slug, config, retries - 1)); @@ -42,7 +43,7 @@ function search(keyword, config = {}, retries = 2) { const requestUrl = proxyUrl => `${proxyUrl}/${keyword}/${page}/` return Promises.first(proxyList - .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) .then(body => parseTableBody(body)) .then(torrents => torrents.length === 40 && page < extendToPage ? search(keyword, { ...config, page: page + 1 }).catch(() => []) @@ -61,7 +62,7 @@ function browse(config = {}, retries = 2) { const requestUrl = proxyUrl => category ? 
`${proxyUrl}/${category}/${page}/` : `${proxyUrl}/${page}/`; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) .then((body) => parseTableBody(body)) .catch((err) => browse(config, retries - 1)); } @@ -97,12 +98,13 @@ function parseTableBody(body) { const row = $(element); torrents.push({ name: row.find("a").text(), - torrentId: row.find("a").attr("href") + torrentId: row.find("a").attr("href"), + isTorrent: !!row.find("p:contains(\'Torrent\')")[0] }); }); resolve(torrents); }); -} +} function parseTorrentPage(body) { return new Promise((resolve, reject) => { @@ -121,16 +123,19 @@ function parseTorrentPage(body) { const isAnime = category === Categories.ANIME const torrent = magnets.map(magnetLink => { const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) - if(isDubled(name) || isAnime) { + const sanitizedTitle = sanitizePtName(name); + const originalTitle = details.find('strong:contains(\'Baixar\')')[0].nextSibling.nodeValue.split('-')[0]; + const year = details.find('strong:contains(\'Data de Lançamento: \')').next().text().trim(); + const fallBackTitle = `${originalTitle.trim()} ${year.trim()} ${sanitizedTitle.trim()}`; + if (isPtDubbed(name) || isAnime) { return { - name: parseText(name), - original_name: parseName(details.find('strong:contains(\'Baixar\')')[0].nextSibling.nodeValue.split('-')[0]), - year: details.find('strong:contains(\'Data de Lançamento: \')').next().text().trim(), + title: sanitizedTitle.length > 4 ? 
sanitizedTitle : fallBackTitle, infoHash: decode(magnetLink).infoHash, magnetLink: magnetLink, category: category, uploadDate: new Date($('time').attr('datetime')), imdbId: details.find('a[href*="imdb.com"]').attr('href').split('/')[4], + languages: sanitizePtLanguages(details.find('strong:contains(\'Idioma\')')[0].nextSibling.nodeValue) }; } }) @@ -138,37 +143,4 @@ function parseTorrentPage(body) { }); } -function parseName(name) { - return name - .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') - .trim() -} - -function isDubled(name){ - name = name.toLowerCase() - if(name.includes('dublado')){ - return true - } - if(name.includes('dual')){ - return true - } - if(name.includes('nacional')){ - return true - } - return false -} - -function parseText(text) { - return text - .replace(/\n|\t/g, "") - .replace(/1A|2A|3A|4A|5A|6A|7A|8A|9A/g, '') - .replace(/COMOEUBAIXO.COM|COMANDO.TO|TEMPORADA|COMPLETA/g, '') - .replace(/MKV|MP4/g, '') - .replace(/[-]/g, '') - .replace(/[.]/g, ' ') - .trim() - .replace(/ /g, '.') - .trim() -} - module.exports = { torrent, search, browse, Categories }; \ No newline at end of file diff --git a/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js b/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js index d8991fb..1c7381c 100644 --- a/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js +++ b/scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js @@ -1,15 +1,14 @@ const moment = require("moment"); const Bottleneck = require("bottleneck"); -const leetx = require("./comoeubaixo_api"); +const comoeubaixo = require("./comoeubaixo_api"); const { Type } = require("../../lib/types"); const repository = require("../../lib/repository"); const Promises = require("../../lib/promises"); const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries"); const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent"); -const { getImdbId } = require("../../lib/metadata"); const NAME = "ComoEuBaixo"; -const 
UNTIL_PAGE = 5; +const UNTIL_PAGE = 3; const TYPE_MAPPING = typeMapping(); const limiter = new Bottleneck({ maxConcurrent: 5 }); @@ -20,116 +19,95 @@ async function scrape() { console.log(`[${scrapeStart}] starting ${NAME} scrape...`); return scrapeLatestTorrents() - .then(() => { - lastScrape.lastScraped = scrapeStart; - return lastScrape.save(); - }) - .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); } async function updateSeeders(torrent) { - return limiter.schedule(() => leetx.torrent(torrent.torrentId)); + return limiter.schedule(() => comoeubaixo.torrent(torrent.torrentId)); } async function scrapeLatestTorrents() { const allowedCategories = [ - leetx.Categories.MOVIE, - leetx.Categories.TV, - leetx.Categories.DESENHOS + comoeubaixo.Categories.MOVIE, + comoeubaixo.Categories.TV, + comoeubaixo.Categories.DESENHOS ]; - return Promises.sequence( - allowedCategories.map( - (category) => () => scrapeLatestTorrentsForCategory(category) - ) - ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); + return Promises.sequence(allowedCategories + .map((category) => () => scrapeLatestTorrentsForCategory(category))) + .then((entries) => entries.reduce((a, b) => a.concat(b), [])); } -async function scrapeLatestTorrentsForCategory(category, page = 1) { - console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); - return leetx - .browse({ category, page }) - .catch((error) => { - console.warn( - `Failed ${NAME} scrapping for [${page}] ${category} due: `, - error - ); - return Promise.resolve([]); - }) - .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) - .then((resolved) => resolved.length > 0 && page < untilPage(category) ? 
scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); +async function scrapeLatestTorrentsForCategory(category, page = 1) { /* Scrapes listing pages from page up to untilPage(category), stopping early on an empty page; the default starts at page 1 so the page < untilPage(category) guard can advance. */ + console.log(`Scrapping ${NAME} ${category} category page ${page}`); + return comoeubaixo + .browse({ category, page }) + .catch((error) => { + console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) + ? scrapeLatestTorrentsForCategory(category, page + 1) + : Promise.resolve()); } -async function processTorrentRecord(record) { - if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { - return record; +async function processEntry(entry) { + if (!entry.isTorrent) { + return entry; } - const torrentEntrys = await leetx - .torrent(record.torrentId) - .catch(() => undefined); - if (torrentEntrys === undefined) { - return Promise.resolve([]) - } - return await Promise.allSettled( - torrentEntrys.map(async (torrentFound) => { - if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { - return Promise.resolve("Invalid torrent record"); - } - if (isNaN(torrentFound.uploadDate)) { - console.warn( - `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` - ); - return; - } - if (await checkAndUpdateTorrent(torrentFound)) { - return torrentFound; - } - if (!torrentFound.size) { - await updateTorrentSize(torrentFound) - .catch((err) => Promise.resolve(err)) - } - if (!torrentFound.seeders) { - await updateCurrentSeeders(torrentFound) - .then(response => response.seeders === 0 ? 
delete response.seeders : response) - } - if (!torrentFound.imdbId) { - torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) - } + return comoeubaixo.torrent(entry.torrentId) + .then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record)))) + .catch((error) => console.warn(`Failed ${NAME} processing of [${entry.torrentId}] due: `, error)); /* best-effort: a failed entry is logged and still resolves to undefined */ +} - const torrent = { - infoHash: torrentFound.infoHash, - provider: NAME, - torrentId: torrentFound.torrentId, - name: torrentFound.original_name, - title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), - type: TYPE_MAPPING[torrentFound.category], - year: torrentFound.year, - imdbId: torrentFound.imdbId, - uploadDate: torrentFound.uploadDate, - seeders: torrentFound.seeders, - size: torrentFound.size, - files: torrentFound.files - }; - return createTorrentEntry(torrent); - }) - ); +async function processTorrentRecord(foundTorrent) { + if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) { + return foundTorrent; + } + + if (!foundTorrent.size) { + await updateTorrentSize(foundTorrent); + } + if (!Number.isInteger(foundTorrent.seeders)) { + await updateCurrentSeeders(foundTorrent); + } + + const torrent = { + infoHash: foundTorrent.infoHash, + provider: NAME, + torrentId: foundTorrent.torrentId, + title: foundTorrent.title, + type: TYPE_MAPPING[foundTorrent.category], + imdbId: foundTorrent.imdbId, + uploadDate: foundTorrent.uploadDate, + seeders: foundTorrent.seeders, + size: foundTorrent.size, + files: foundTorrent.files, + languages: foundTorrent.languages + }; + return createTorrentEntry(torrent); } function typeMapping() { const mapping = {}; - mapping[leetx.Categories.MOVIE] = Type.MOVIE; - mapping[leetx.Categories.TV] = Type.SERIES; - mapping[leetx.Categories.ANIME] = Type.ANIME; - mapping[leetx.Categories.DESENHOS] = Type.SERIES; + mapping[comoeubaixo.Categories.MOVIE] = Type.MOVIE; + mapping[comoeubaixo.Categories.TV] = Type.SERIES; + mapping[comoeubaixo.Categories.ANIME] = 
Type.ANIME; + mapping[comoeubaixo.Categories.DESENHOS] = Type.SERIES; return mapping; } function untilPage(category) { - if (leetx.Categories.DESENHOS === category) { - return 5; + if (comoeubaixo.Categories.DESENHOS === category) { + return UNTIL_PAGE; } - if (leetx.Categories.TV === category) { - return 5; + if (comoeubaixo.Categories.TV === category) { + return UNTIL_PAGE; } return UNTIL_PAGE; }