remove scraper module

This commit is contained in:
TheBeastLT
2022-06-16 17:44:38 +03:00
parent a253e8e0b0
commit 554c07d636
69 changed files with 0 additions and 61771 deletions

@@ -1,145 +0,0 @@
const axios = require('axios');
const moment = require('moment');
const cheerio = require('cheerio');
const decode = require('magnet-uri');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper');

const defaultTimeout = 10000;
const maxSearchPage = 50;
const baseUrl = 'https://lapumia.org';
const Categories = {
MOVIE: null,
TV: 'series',
ANIME: 'animes',
};
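
// Fetches a torrent detail page and resolves every release found on it,
// retrying up to `retries` times before rejecting.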
function torrent(torrentId, config = {}, retries = 2) {
if (!torrentId || retries === 0) {
return Promise.reject(new Error(`Failed ${torrentId} query`));
}
return singleRequest(`${baseUrl}/${torrentId}`, config)
.then((body) => parseTorrentPage(body))
.then((torrent) => torrent.map(el => ({ torrentId, ...el })))
.catch((err) => {
console.warn(`Failed Lapumia ${torrentId} request: `, err);
return torrent(torrentId, config, retries - 1);
});
}
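
// Searches the site for a keyword; a full result page (10 posts) suggests
// more pages exist, so the search continues up to extendToPage (capped at maxSearchPage).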
function search(keyword, config = {}, retries = 2) {
if (!keyword || retries === 0) {
return Promise.reject(new Error(`Failed ${keyword} search`));
}
const page = config.page || 1;
const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1));
return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
.then(body => parseTableBody(body))
.then(torrents => torrents.length === 10 && page < extendToPage
? search(keyword, { ...config, page: page + 1 }).catch(() => [])
.then(nextTorrents => torrents.concat(nextTorrents))
: torrents)
.catch((err) => search(keyword, config, retries - 1));
}
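
// Lists the latest posts for a category page, or the front page when no
// category is given.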
function browse(config = {}, retries = 2) {
if (retries === 0) {
return Promise.reject(new Error(`Failed browse request`));
}
const page = config.page || 1;
const category = config.category;
const requestUrl = category ? `${baseUrl}/${category}/page/${page}/` : `${baseUrl}/page/${page}/`;
return singleRequest(requestUrl, config)
.then((body) => parseTableBody(body))
.catch((err) => browse(config, retries - 1));
}
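
// Performs a single GET with a randomized user agent and rejects on empty
// bodies or proxy error pages.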
function singleRequest(requestUrl, config = {}) {
const timeout = config.timeout || defaultTimeout;
// axios takes the user agent via headers and caps redirects with maxRedirects;
// the original needle-style `userAgent`/`follow` keys are silently ignored by axios.
const options = { headers: { 'User-Agent': getRandomUserAgent() }, timeout: timeout, maxRedirects: 2 };
return axios.get(requestUrl, options)
.then((response) => {
const body = response.data;
if (!body) {
throw new Error(`No body: ${requestUrl}`);
} else if (body.includes('502: Bad gateway') ||
body.includes('403 Forbidden')) {
throw new Error(`Invalid body contents: ${requestUrl}`);
}
return body;
})
.catch(error => Promise.reject(error.message || error));
}
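
// Extracts the post name and torrent id from each listing row.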
function parseTableBody(body) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
return reject(new Error('Failed loading body'));
}
const torrents = [];
$('div.post').each((i, element) => {
const row = $(element);
try {
torrents.push({
name: row.find("div > a").text(),
torrentId: row.find("div > a").attr("href").split('/')[3]
});
} catch (e) {
console.log("Failed parsing Lupumia table entry")
}
});
resolve(torrents);
});
}
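
// Collects magnet links from the dubbed-release sections of a detail page
// and enriches each with metadata scraped from the surrounding markup.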
function parseTorrentPage(body) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
return reject(new Error('Failed loading body'));
}
const magnets = $('h2 > span')
.filter((i, elem) => isPtDubbed($(elem).text())).parent()
.map((i, elem) => $(elem).nextUntil('h2, hr'))
.map((i, elem) => $(elem).find('a[href^="magnet"]'))
.map((i, section) => $(section).attr("href")).get();
const category = parseCategory($('div.category').html());
const details = $('div.content')
const torrents = magnets.filter(magnetLink => decode(magnetLink).name).map(magnetLink => ({
title: sanitizePtName(escapeHTML(decode(magnetLink).name.replace(/\+/g, ' '))),
originalName: sanitizePtOriginalName(details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue),
year: details.find('b:contains(\'Ano de Lançamento:\')')[0].nextSibling.nodeValue.trim(),
infoHash: decode(magnetLink).infoHash,
magnetLink: magnetLink,
category: category,
uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', 'pt-br').format()),
imdbId: $('.imdbRatingPlugin').attr('data-title') || null,
languages: sanitizePtLanguages(details.find('b:contains(\'Idioma\')')[0].nextSibling.nodeValue)
}));
resolve(torrents.filter((x) => x));
});
}
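
// Resolves the site category block to one of the internal Categories.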
function parseCategory(categoryHtml) {
const $ = cheerio.load(categoryHtml);
if ($('a:contains(\'Animes\')').text()) {
return Categories.ANIME;
}
if ($('a:contains(\'Series\')').text()) {
return Categories.TV;
}
return Categories.MOVIE;
}
module.exports = { torrent, search, browse, Categories };
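
For reference, a minimal sketch of how this API module could be exercised on its own, assuming it is required from a local path; the search keyword is hypothetical:

const lapumia = require('./lapumia_api');  // hypothetical local path

// Search the first two result pages, then fetch full metadata for the first hit.
lapumia.search('vingadores', { extendToPage: 2 })
    .then(results => lapumia.torrent(results[0].torrentId))
    .then(torrents => torrents.forEach(t => console.log(t.title, t.infoHash)))
    .catch(err => console.warn('Lapumia query failed:', err));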

@@ -1,112 +0,0 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const lapumia = require("./lapumia_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getImdbId } = require("../../lib/metadata");
const NAME = "Lapumia";
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping();
const limiter = new Bottleneck({ maxConcurrent: 5 });
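
// Entry point: scrapes the latest listings for each allowed category and
// stamps the provider row with the scrape start time when done.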
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
return scrapeLatestTorrents()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return lastScrape.save();
})
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
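
// Re-scrapes a torrent's detail page through the shared limiter so the
// framework can refresh its stored stats.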
async function updateSeeders(torrent) {
return limiter.schedule(() => lapumia.torrent(torrent.torrentId));
}
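
// Scrapes each allowed category in sequence and flattens the results.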
async function scrapeLatestTorrents() {
const allowedCategories = [
lapumia.Categories.MOVIE,
lapumia.Categories.TV
];
return Promises.sequence(allowedCategories
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
}
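
// Crawls a category page by page until a page comes back empty or the
// per-category page limit is reached, rate limiting each entry.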
async function scrapeLatestTorrentsForCategory(category, page = 1) {
console.log(`Scraping ${NAME} ${category} category page ${page}`);
return lapumia
.browse({ category, page })
.catch((error) => {
console.warn(`Failed ${NAME} scraping for [${page}] ${category} due to: `, error);
return Promise.resolve([]);
})
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
.then((resolved) => resolved.length > 0 && page < untilPage(category)
? scrapeLatestTorrentsForCategory(category, page + 1)
: Promise.resolve());
}
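
// Expands a listing entry into full torrent records and processes each one.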
async function processEntry(entry) {
return lapumia.torrent(entry.torrentId)
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
.catch(() => undefined);
}
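
// Skips records already stored, then backfills size, seeders and imdbId
// before creating a new torrent entry.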
async function processTorrentRecord(foundTorrent) {
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
return foundTorrent;
}
if (!foundTorrent.size) {
await updateTorrentSize(foundTorrent);
}
if (!Number.isInteger(foundTorrent.seeders)) {
await updateCurrentSeeders(foundTorrent);
}
if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) {
const info = { title: foundTorrent.originalName, year: foundTorrent.year };
foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
}
const torrent = {
infoHash: foundTorrent.infoHash,
provider: NAME,
torrentId: foundTorrent.torrentId,
title: foundTorrent.title,
type: TYPE_MAPPING[foundTorrent.category],
imdbId: foundTorrent.imdbId,
uploadDate: foundTorrent.uploadDate,
seeders: foundTorrent.seeders,
size: foundTorrent.size,
files: foundTorrent.files,
languages: foundTorrent.languages
};
return createTorrentEntry(torrent);
}
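
// Maps site categories to internal types. The MOVIE category is null, which
// stringifies to the object key "null"; lookups pass the same value, so it resolves.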
function typeMapping() {
const mapping = {};
mapping[lapumia.Categories.MOVIE] = Type.MOVIE;
mapping[lapumia.Categories.TV] = Type.SERIES;
mapping[lapumia.Categories.ANIME] = Type.ANIME;
return mapping;
}
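
// Crawl depth per category: two pages for anime, UNTIL_PAGE for the rest.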
function untilPage(category) {
if (lapumia.Categories.ANIME === category) {
return 2;
}
return UNTIL_PAGE;
}
module.exports = { scrape, updateSeeders, NAME };
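
A minimal sketch of how a provider module like this one is typically wired into a scheduler; the node-cron dependency and the file path are assumptions for illustration, not part of this repository:

const cron = require('node-cron');  // assumed scheduler dependency
const lapumiaScraper = require('./scrapers/lapumia/lapumia_scraper');  // hypothetical path

// Kick off a scrape every four hours; failures are logged, not rethrown.
cron.schedule('0 */4 * * *', () => lapumiaScraper.scrape()
    .catch(err => console.error(`${lapumiaScraper.NAME} scrape failed:`, err)));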