Added back original scrapers, integrated with PGSQL

This commit is contained in:
Gabisonfire
2024-01-17 16:20:00 -05:00
parent 364d1545b1
commit cab7f38c66
79 changed files with 65832 additions and 1702 deletions

View File

@@ -0,0 +1,137 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const Promises = require('../../lib/promises');
const defaultUrl = 'https://horriblesubs.info';
const defaultTimeout = 10000;
/**
 * Lists every show on the HorribleSubs shows index page.
 * @param {object} config - optional { proxyUrl, timeout }
 * @returns {Promise<Array<{title: string, url: string}>>}
 */
function allShows(config = {}) {
  const baseUrl = config.proxyUrl || defaultUrl;
  return _getContent('/shows/', config)
      .then(($) => $('div[class="ind-show"]')
          // Each index entry wraps a single <a> with the show title and link.
          .map((i, el) => $(el).children('a'))
          .map((i, anchor) => ({
            title: anchor.attr('title'),
            url: `${baseUrl}${anchor.attr('href')}`
          }))
          .get());
}
/**
 * Retrieves full data for one show: its numeric show id plus its
 * single-episode and batch ("pack") release entries.
 * @param {object|string} showInfo - show object ({ title, url }) or a bare url string
 * @param {object} config - optional { proxyUrl, timeout }
 * @returns {Promise<object>} { title, url, showId, singleEpisodes, packEpisodes }
 */
async function showData(showInfo, config = {}) {
  const showEndpoint = (showInfo.url || showInfo).match(/\/show.+/)[0];
  const title = showInfo.title;
  // Forward config so proxyUrl/timeout are honored when resolving the id.
  const showId = await _getShowId(showEndpoint, config);
  // Batch and single entries are independent requests — fetch them in parallel
  // instead of sequentially awaiting each one.
  const [packEntries, singleEntries] = await Promise.all([
    _getShowEntries(showId, title, 'batch', config),
    _getShowEntries(showId, title, 'show', config)
  ]);
  return {
    title: title,
    url: showInfo.url || showInfo,
    showId: showId,
    singleEpisodes: singleEntries,
    packEpisodes: packEntries
  };
}
/**
 * Resolves the latest uploads into full entry objects, one show at a time
 * (Promises.sequence), dropping uploads that could not be matched.
 * @param {object} config - optional { proxyUrl, timeout }
 */
async function getLatestEntries(config = {}) {
  const latest = await _getAllLatestEntries(config);
  const resolved = await Promises.sequence(
      latest.map((entry) => () => _findLatestEntry(entry, config)));
  return resolved.filter((entry) => entry);
}
/**
 * Fetches a page (optionally through config.proxyUrl) and loads it into cheerio.
 * Absolute urls get their origin swapped for the configured base url.
 */
function _getContent(endpoint, config = {}) {
  const baseUrl = config.proxyUrl || defaultUrl;
  const timeout = config.timeout || defaultTimeout;
  let url;
  if (endpoint.startsWith('http')) {
    url = endpoint.replace(/https?:\/\/[^/]+/, baseUrl);
  } else {
    url = `${baseUrl}${endpoint}`;
  }
  return needle('get', url, { open_timeout: timeout })
      .then((response) => cheerio.load(response.body));
}
/**
 * Extracts the numeric show id from the `var hs_showid = N` inline script on a
 * show page.
 * @param {string} showEndpoint - show page endpoint (trailing `#N` fragment allowed)
 * @param {object} config - optional { proxyUrl, timeout }; previously this was
 *     not forwarded, so proxy/timeout settings were silently ignored here.
 * @returns {Promise<string>} the show id
 */
function _getShowId(showEndpoint, config = {}) {
  return _getContent(showEndpoint.replace(/(?:#\d+)?\/?$/, '/'), config)
      .then(($) => $('div.entry-content').find('script').html().match(/var hs_showid = (\d+)/)[1]);
}
// Fetches all entries of the given type for a show and keeps only those whose
// parsed title matches the expected show title exactly.
async function _getShowEntries(animeId, animeTitle, type, config) {
  const allEntries = await _getAllEntries(animeId, type, config);
  return allEntries.filter((entry) => entry.title === animeTitle);
}
// Fetches one API page of entries; with autoExtend it keeps requesting the next
// page until an empty page is returned and concatenates the results.
async function _getAllEntries(animeId, type, config, page = 0, autoExtend = true) {
  const entriesEndpoint = `/api.php?method=getshows&type=${type}&showid=${animeId}&nextid=${page}`;
  const entries = await _getEntries(entriesEndpoint, config);
  if (!autoExtend || !entries.length) {
    return entries;
  }
  const nextEntries = await _getAllEntries(animeId, type, config, page + 1, autoExtend);
  return entries.concat(nextEntries);
}
// Fetches one API page and parses each release container into a plain object
// with title, episode, upload date and per-resolution mirror links.
function _getEntries(endpoint, config) {
  return _getContent(endpoint, config)
      .then(($) => $('div[class="rls-info-container"]')
          .map((index, element) => ({
            // The label anchor mixes the show title (bare text) with child
            // elements; nodeType === 3 keeps only text nodes so the title is
            // read without the episode/date markup.
            title: $(element).find('a[class="rls-label"]').contents()
                .filter((i, el) => el.nodeType === 3).first().text().trim(),
            // Episode number lives in a <strong> inside the same label.
            episode: $(element).find('a[class="rls-label"]').find('strong').text(),
            uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
            // One child per resolution; the element id ends in e.g. "720p".
            mirrors: $(element).find('div[class="rls-links-container"]').children()
                .map((indexLink, elementLink) => ({
                  resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
                  magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
                  torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
                })).get()
          })).get());
}
// Walks the paginated "latest" API, collecting { urlEndpoint, episode } pairs.
// A page with fewer than 12 entries is treated as the last page.
async function _getAllLatestEntries(config, page = 0) {
  const pageParam = page === 0 ? '' : `&nextid=${page}`;
  const $ = await _getContent(`/api.php?method=getlatest${pageParam}`, config);
  const entries = $('li a')
      .map((index, element) => ({
        urlEndpoint: $(element).attr('href'),
        episode: $(element).find('strong').text()
      }))
      .get();
  if (entries.length < 12) {
    return entries;
  }
  const nextEntries = await _getAllLatestEntries(config, page + 1);
  return entries.concat(nextEntries);
}
// Pages through a show's entries until the one matching the latest-episode
// number is found; returns undefined when the pages run out without a match.
async function _findLatestEntry(entry, config) {
  const showId = await _getShowId(entry.urlEndpoint);
  for (let page = 0; ; page++) {
    const pageEntries = await _getAllEntries(showId, 'show', config, page, false);
    const match = pageEntries.find((e) => e.episode === entry.episode);
    if (match) {
      return {
        title: match.title,
        url: entry.urlEndpoint,
        showId: showId,
        singleEpisodes: [match]
      };
    }
    if (pageEntries.length === 0) {
      // Exhausted all pages without finding the episode.
      return undefined;
    }
  }
}
// Converts the site's relative ("Today"/"Yesterday") or MM/DD/YYYY date label
// into a Date object.
function _parseDate(date) {
  if (/today/i.test(date)) {
    return moment().toDate();
  }
  if (/yesterday/i.test(date)) {
    return moment().subtract(1, 'day').toDate();
  }
  return moment(date, 'MM/DD/YYYY').toDate();
}
module.exports = { allShows, showData, getLatestEntries, _getShowId };

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,186 @@
const fs = require('fs');
const moment = require('moment');
const Bottleneck = require('bottleneck');
const decode = require('magnet-uri');
const horriblesubs = require('./horriblesubs_api.js');
const repository = require('../../lib/repository');
const { Type } = require('../../lib/types');
const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');
const NAME = 'HorribleSubs';
const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;
const limiter = new Bottleneck({ maxConcurrent: 5 });
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Entry point: runs a full scrape of all shows when the last full scrape is
 * older than NEXT_FULL_SCRAPE_OFFSET (or has never run), otherwise only
 * processes the latest uploads. Updates the provider's lastScraped timestamp
 * after a successful full scrape.
 */
async function scrape() {
  const scrapeStart = moment();
  const provider = await repository.getProvider({ name: NAME });
  const lastScraped = provider.lastScraped && moment(provider.lastScraped);
  const fullScrapeDue = !lastScraped
      || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart;
  if (fullScrapeDue) {
    console.log(`[${scrapeStart}] scrapping all ${NAME} shows...`);
    await _scrapeAllShows();
    provider.lastScraped = scrapeStart;
    await provider.save();
    console.log(`[${moment()}] finished scrapping all ${NAME} shows`);
  } else {
    console.log(`[${scrapeStart}] scrapping latest ${NAME} entries...`);
    await _scrapeLatestEntries();
    console.log(`[${moment()}] finished scrapping latest ${NAME} entries`);
  }
}
// No-op for this provider: seeder refreshing is not supported, so an empty
// result list is always resolved.
async function updateSeeders(torrent) {
  return [];
}
// Processes every entry from the "latest uploads" feed through the show
// limiter; per-entry failures are logged and do not abort the batch.
async function _scrapeLatestEntries() {
  const latestEntries = await horriblesubs.getLatestEntries();
  const tasks = latestEntries.map((entryData) =>
      limiter.schedule(() => _parseShowData(entryData).catch((err) => console.log(err))));
  return Promise.all(tasks);
}
// Full scrape: fetches every show's data and parses it (without refreshing
// seeders); per-show failures are logged and do not abort the batch.
async function _scrapeAllShows() {
  const shows = await horriblesubs.allShows();
  const tasks = shows.map((show) =>
      limiter.schedule(() => horriblesubs.showData(show)
          .then((showData) => _parseShowData(showData, false))
          .catch((err) => console.log(err))));
  return Promise.all(tasks);
}
// Diagnostic helper: enriches shows with freshly-searched kitsu ids and reports
// which ones disagree with the static mapping file, plus the mismatch ratio.
async function compareSearchKitsuIds() {
  console.log(`${NAME}: initiating kitsu compare...`);
  // NOTE(review): slice(0, 1) restricts the comparison to a single show — this
  // looks like leftover debug throttling; confirm before trusting the ratio.
  const shows = await horriblesubs.allShows()
      .then((shows) => Promise.all(shows.slice(0, 1).map((show) => limiter.schedule(() => enrichShow(show)))));
  const incorrect = shows.filter(
      (show) => showMappings[show.title] && showMappings[show.title].kitsu_id !== show.kitsu_id);
  const incorrectRatio = incorrect.length / shows.length;
  console.log(incorrect);
  console.log(`Ratio: ${incorrectRatio}`);
}
/**
 * Builds kitsu mappings for any shows missing from the static mapping file and
 * writes the merged mapping back to disk.
 *
 * Fix: the previous callback-style fs.writeFile was never awaited, so this
 * async function resolved before the file was actually written; the awaited
 * fs.promises form lets callers observe completion (and failures are still
 * only logged, as before).
 */
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const shows = await horriblesubs.allShows()
      .then((shows) => shows.filter((show) => !showMappings[show.title]))
      .then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
      // Merge the enriched shows into the existing mapping object keyed by title.
      .then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
  try {
    await fs.promises.writeFile(
        "./scraper/scrapers/horriblesubs/horriblesubs_mapping.json",
        JSON.stringify(shows), 'utf8');
    console.log(`${NAME}: finished kitsu mapping`);
  } catch (err) {
    console.log("An error occurred while writing JSON Object to File.", err);
  }
}
// Augments a show with its HorribleSubs show id and kitsu metadata; metadata
// lookup failures are logged and degrade to empty fields rather than throwing.
async function enrichShow(show) {
  console.log(`${NAME}: getting show info for ${show.title}...`);
  // On failure the title stands in as the id, mirroring the original fallback.
  const showId = await horriblesubs._getShowId(show.url).catch(() => show.title);
  let metadata;
  try {
    const foundKitsuId = await getKitsuId({ title: show.title });
    metadata = await getMetadata(foundKitsuId);
  } catch (error) {
    console.log(`Failed getting kitsu meta: ${error.message}`);
    metadata = {};
  }
  // Spread order matters: show's own fields win over showId/kitsu_id defaults.
  return {
    showId: showId,
    kitsu_id: metadata.kitsuId,
    ...show,
    kitsuTitle: metadata.title,
    imdb_id: metadata.imdbId
  };
}
// Converts one scraped show (single episodes + batch packs) into torrent
// records and pushes each through processTorrentRecord via the entry limiter.
// Throws when the static mapping file has no entry or no kitsu id for the
// show title, so callers can log and skip unmapped shows.
async function _parseShowData(showData, updateSeeders = true) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const showMapping = showMappings[showData.title];
  const kitsuId = showMapping && showMapping.kitsu_id;
  if (!showMapping) {
    throw new Error(`No kitsu mapping found for ${showData.title}`);
  }
  if (!kitsuId) {
    throw new Error(`No kitsuId found for ${showData.title}`);
  }
  // sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
  // When kitsu_id is an array, build absolute-episode -> { kitsuId, episode,
  // title } lookup by stacking each season's episode count (meta.totalCount)
  // as an offset; otherwise the mapping stays empty and kitsuId is used as-is.
  const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
      .then((metas) => metas.reduce((map, meta) => {
        const epOffset = Object.keys(map).length;
        [...Array(meta.totalCount || 1).keys()]
            .map(ep => ep + 1)
            .forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title });
        return map;
      }, {})) || {};
  // Prefer the per-season title/episode when the absolute episode number
  // (leading zeros stripped) resolves through the multi-season mapping.
  const formatTitle = (episodeInfo, mirror) => {
    const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
    if (mapping) {
      return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
    }
    return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
  };
  // Resolves the kitsu id for an episode label; for ranges like "01-12" the
  // first number is used. Unmapped episodes of multi-season shows yield
  // undefined and are filtered out below.
  const getKitsuId = inputEpisode => {
    const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode;
    const episode = parseInt(episodeString, 10);
    if (kitsuIdsMapping[episode]) {
      return kitsuIdsMapping[episode].kitsuId;
    } else if (Array.isArray(kitsuId)) {
      console.warn(`Unmapped episode number for ${showData.title} - ${inputEpisode}`);
      return undefined;
    }
    return kitsuId;
  };
  // Flatten every episode's mirrors (skipping those without a magnet link)
  // into torrent records, drop records without a resolved kitsu id, then
  // process each record under the entry limiter.
  return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
      .map(episodeInfo => episodeInfo.mirrors
          .filter(mirror => mirror.magnetLink && mirror.magnetLink.length)
          .map(mirror => ({
            provider: NAME,
            ...mirror,
            infoHash: decode(mirror.magnetLink).infoHash,
            trackers: decode(mirror.magnetLink).tr.join(','),
            title: formatTitle(episodeInfo, mirror),
            type: Type.ANIME,
            kitsuId: getKitsuId(episodeInfo.episode),
            uploadDate: episodeInfo.uploadDate,
          })))
      .reduce((a, b) => a.concat(b), [])
      .filter(torrent => torrent.kitsuId)
      .map(torrent => entryLimiter.schedule(() => processTorrentRecord(torrent, updateSeeders))))
      .then(() => console.log(`${NAME}: finished scrapping ${showData.title} data`));
}
// Persists one torrent record: refreshes seeders for torrents this provider
// already stored (when requested), otherwise sizes it, fetches seeders and
// creates a new entry. Creation failures are logged, not rethrown.
async function processTorrentRecord(torrent, updateSeeders = true) {
  const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
  if (existingTorrent && existingTorrent.provider === NAME) {
    if (!updateSeeders) {
      return torrent;
    }
    const updatedSeeders = await updateCurrentSeeders(torrent);
    return checkAndUpdateTorrent(updatedSeeders);
  }
  try {
    const sized = await updateTorrentSize(torrent);
    const withSeeders = await updateCurrentSeeders(sized);
    // Await inside try so a rejected creation is caught here, as before.
    return await createTorrentEntry(withSeeders, true);
  } catch (error) {
    return console.warn(`Failed creating entry for ${torrent.title}:`, error);
  }
}
module.exports = { scrape, updateSeeders, NAME };