From 3379133ad2f8bbb2dc557946e50d29d7aaa408fd Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Tue, 14 Sep 2021 15:15:02 +0200 Subject: [PATCH] format lapumia provider --- scraper/scrapers/lapumia/lapumia_api.js | 62 +++------ scraper/scrapers/lapumia/lapumia_scraper.js | 146 +++++++++----------- 2 files changed, 82 insertions(+), 126 deletions(-) diff --git a/scraper/scrapers/lapumia/lapumia_api.js b/scraper/scrapers/lapumia/lapumia_api.js index 206833c..b111032 100644 --- a/scraper/scrapers/lapumia/lapumia_api.js +++ b/scraper/scrapers/lapumia/lapumia_api.js @@ -4,8 +4,8 @@ const cheerio = require("cheerio"); const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); const { escapeHTML } = require('../../lib/metadata'); -const { getRandomUserAgent } = require("../../lib/requestHelper"); -moment.locale("pt-br"); +const { getRandomUserAgent } = require('../../lib/requestHelper'); +const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper') const defaultTimeout = 10000; const maxSearchPage = 50 @@ -27,7 +27,7 @@ function torrent(torrentId, config = {}, retries = 2) { const proxyList = config.proxyList || defaultProxies; const slug = torrentId.split('?p=')[1]; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(`${proxyUrl}/?p=${slug}`, config))) + .map((proxyUrl) => singleRequest(`${proxyUrl}/?p=${slug}`, config))) .then((body) => parseTorrentPage(body)) .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el }))) .catch((err) => torrent(slug, config, retries - 1)); @@ -43,7 +43,7 @@ function search(keyword, config = {}, retries = 2) { const requestUrl = proxyUrl => `${proxyUrl}/page/${page}/?s=${keyword}` return Promises.first(proxyList - .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) + .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config))) .then(body => parseTableBody(body)) .then(torrents => torrents.length === 10 && page < extendToPage ? search(keyword, { ...config, page: page + 1 }).catch(() => []) @@ -62,7 +62,7 @@ function browse(config = {}, retries = 2) { const requestUrl = proxyUrl => category ? `${proxyUrl}/${category}/page/${page}/` : `${proxyUrl}/page/${page}/` return Promises.first(proxyList - .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) + .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config))) .then((body) => parseTableBody(body)) .catch((err) => browse(config, retries - 1)); } @@ -103,7 +103,7 @@ function parseTableBody(body) { }); resolve(torrents); }); -} +} function parseTorrentPage(body) { return new Promise((resolve, reject) => { @@ -117,58 +117,36 @@ function parseTorrentPage(body) { let magnet = $(section).attr("href"); magnets.push(magnet); }); - const categorys = $('div.category').html(); + const category = parseCategory($('div.category').html()); const details = $('div.content') - const isAnime = parseCategory(categorys) === Categories.ANIME + const isAnime = category === Categories.ANIME const torrent = magnets.map(magnetLink => { const name = escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')) - if(isDubled(name) || isAnime) { + if (isPtDubbed(name) || isAnime) { return { - name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')), - original_name: parseName(details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue), + title: sanitizePtName(name), + originalName: sanitizePtOriginalName( + details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue), year: details.find('b:contains(\'Ano de Lançamento:\')')[0].nextSibling.nodeValue.trim(), infoHash: decode(magnetLink).infoHash, magnetLink: magnetLink, - category: parseCategory(categorys), - uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', true).format()), - imdbId: $('.imdbRatingPlugin').attr('data-title') || null + category: category, + uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', 'pt-br').format()), + imdbId: $('.imdbRatingPlugin').attr('data-title') || null, + languages: sanitizePtLanguages(details.find('b:contains(\'Idioma\')')[0].nextSibling.nodeValue) }; - }; + } }) resolve(torrent.filter((x) => x)); }); } -function parseName(name) { - return name - .replace(/S01|S02|S03|S04|S05|S06|S07|S08|S09/g, '') -} - -function isDubled(name){ - name = name.toLowerCase() - if(name.includes('dublado')){ - return true - } - if(name.includes('dual')){ - return true - } - if(name.includes('nacional')){ - return true - } - if(name.includes('multi')){ - return true - } - return false -} - function parseCategory(categorys) { const $ = cheerio.load(categorys) - const isAnime = $('a:contains(\'Animes\')').text() - const isSerie = $('a:contains(\'Series\')').text() - if(isAnime) { + if ($('a:contains(\'Animes\')').text()) { return Categories.ANIME - } - if(isSerie) { + } + if ($('a:contains(\'Series\')').text()) { return Categories.TV } return Categories.MOVIE diff --git a/scraper/scrapers/lapumia/lapumia_scraper.js b/scraper/scrapers/lapumia/lapumia_scraper.js index 3fe2da2..f066643 100644 --- a/scraper/scrapers/lapumia/lapumia_scraper.js +++ b/scraper/scrapers/lapumia/lapumia_scraper.js @@ -1,6 +1,6 @@ const moment = require("moment"); const Bottleneck = require("bottleneck"); -const leetx = require("./lapumia_api"); +const lapumia = require("./lapumia_api"); const { Type } = require("../../lib/types"); const repository = require("../../lib/repository"); const Promises = require("../../lib/promises"); @@ -20,114 +20,92 @@ async function scrape() { console.log(`[${scrapeStart}] starting ${NAME} scrape...`); return scrapeLatestTorrents() - .then(() => { - lastScrape.lastScraped = scrapeStart; - return lastScrape.save(); - }) - .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); + .then(() => { + lastScrape.lastScraped = scrapeStart; + return lastScrape.save(); + }) + .then(() => console.log(`[${moment()}] finished ${NAME} scrape`)); } async function updateSeeders(torrent) { - return limiter.schedule(() => leetx.torrent(torrent.torrentId)); + return limiter.schedule(() => lapumia.torrent(torrent.torrentId)); } async function scrapeLatestTorrents() { const allowedCategories = [ - leetx.Categories.MOVIE, - leetx.Categories.TV, - leetx.Categories.ANIME, + lapumia.Categories.MOVIE ]; - return Promises.sequence( - allowedCategories.map( - (category) => () => scrapeLatestTorrentsForCategory(category) - ) - ).then((entries) => entries.reduce((a, b) => a.concat(b), [])); + return Promises.sequence(allowedCategories + .map((category) => () => scrapeLatestTorrentsForCategory(category))) + .then((entries) => entries.reduce((a, b) => a.concat(b), [])); } async function scrapeLatestTorrentsForCategory(category, page = 1) { - console.log({Scraper: `Scrapping ${NAME} ${category} category page ${page}`}); - return leetx - .browse({ category, page }) - .catch((error) => { - console.warn( - `Failed ${NAME} scrapping for [${page}] ${category} due: `, - error - ); - return Promise.resolve([]); - }) - .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))))) - .then((resolved) => resolved.length > 0 && page < untilPage(category) ? scrapeLatestTorrentsForCategory(category, page + 1) : Promise.resolve()); + console.log(`Scrapping ${NAME} ${category} category page ${page}`); + return lapumia + .browse({ category, page }) + .catch((error) => { + console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error); + return Promise.resolve([]); + }) + .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))))) + .then((resolved) => resolved.length > 0 && page < untilPage(category) + ? scrapeLatestTorrentsForCategory(category, page + 1) + : Promise.resolve()); } -async function processTorrentRecord(record) { - if (await checkAndUpdateTorrent({ provider: NAME, ...record })) { - return record; - } - const torrentEntrys = await leetx - .torrent(record.torrentId) - .catch(() => undefined); - if (torrentEntrys === undefined) { - return Promise.resolve([]) - } - return await Promise.allSettled( - torrentEntrys.map(async (torrentFound) => { - if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) { - return Promise.resolve("Invalid torrent record"); - } - if (isNaN(torrentFound.uploadDate)) { - console.warn( - `Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}` - ); - return; - } - if (await checkAndUpdateTorrent(torrentFound)) { - return torrentFound; - } - if (!torrentFound.size) { - await updateTorrentSize(torrentFound) - .catch((err) => Promise.resolve(err)) - } - if (!torrentFound.seeders) { - await updateCurrentSeeders(torrentFound) - .then(response => response.seeders === 0 ? delete response.seeders : response) - } - if (!torrentFound.imdbId) { - torrentFound.imdbId = await getImdbId(torrentFound.original_name, torrentFound.year, TYPE_MAPPING[torrentFound.category]) - } +async function processEntry(entry) { + return lapumia.torrent(entry.torrentId) + .then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record)))) + .catch(() => undefined); +} - const torrent = { - infoHash: torrentFound.infoHash, - provider: NAME, - torrentId: torrentFound.torrentId, - name: torrentFound.original_name, - title: torrentFound.name.replace(/\t|\s+/g, " ").trim(), - type: TYPE_MAPPING[torrentFound.category], - year: torrentFound.year, - imdbId: torrentFound.imdbId, - uploadDate: torrentFound.uploadDate, - seeders: torrentFound.seeders, - size: torrentFound.size, - files: torrentFound.files - }; - return createTorrentEntry(torrent); - }) - ); +async function processTorrentRecord(foundTorrent) { + if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) { + return foundTorrent; + } + + if (!foundTorrent.size) { + await updateTorrentSize(foundTorrent); + } + if (!Number.isInteger(foundTorrent.seeders)) { + await updateCurrentSeeders(foundTorrent); + } + if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) { + const info = { title: foundTorrent.originalName, year: foundTorrent.year }; + foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined); + } + + const torrent = { + infoHash: foundTorrent.infoHash, + provider: NAME, + torrentId: foundTorrent.torrentId, + title: foundTorrent.title, + type: TYPE_MAPPING[foundTorrent.category], + imdbId: foundTorrent.imdbId, + uploadDate: foundTorrent.uploadDate, + seeders: foundTorrent.seeders, + size: foundTorrent.size, + files: foundTorrent.files, + languages: foundTorrent.languages + }; + return createTorrentEntry(torrent); } function typeMapping() { const mapping = {}; - mapping[leetx.Categories.MOVIE] = Type.MOVIE; - mapping[leetx.Categories.TV] = Type.SERIES; - mapping[leetx.Categories.ANIME] = Type.ANIME; + mapping[lapumia.Categories.MOVIE] = Type.MOVIE; + mapping[lapumia.Categories.TV] = Type.SERIES; + mapping[lapumia.Categories.ANIME] = Type.ANIME; return mapping; } function untilPage(category) { - if (leetx.Categories.TV === category) { + if (lapumia.Categories.TV === category) { return 5; } - if (leetx.Categories.ANIME === category) { + if (lapumia.Categories.ANIME === category) { return 2; } return UNTIL_PAGE;