knightcrawler/scraper/scrapers/horriblesubs/horriblesubs_scraper.js

const fs = require('fs');
const moment = require('moment');
const Bottleneck = require('bottleneck');
const decode = require('magnet-uri');
const horriblesubs = require('./horriblesubs_api.js');
const repository = require('../../lib/repository');
const { Type } = require('../../lib/types');
const { updateCurrentSeeders } = require('../../lib/torrent');
const { parseTorrentFiles } = require('../../lib/torrentFiles');
const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');

// Provider name: used as the repository provider key, as the `provider` value of created torrents
// and as the prefix of this module's log messages.
const NAME = 'HorribleSubs';
// Age in seconds (3 days) after which the last recorded scrape is considered stale
// and `scrape` runs a full scrape of all shows instead of only the latest entries.
const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days

// Limits show-level jobs (show page fetch / show enrichment) to 5 running at the same time.
const limiter = new Bottleneck({ maxConcurrent: 5 });
// Limits per-torrent jobs scheduled in `_parseShowData` to 10 running at the same time.
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });

/**
 * Entry point of the scraper.
 *
 * Runs a full scrape of all shows when the provider record has no `lastScraped` value or when
 * that value is more than NEXT_FULL_SCRAPE_OFFSET seconds older than the start of this run;
 * otherwise only the latest entries are scraped. After a full scrape the provider record is
 * updated with the start time of this run.
 *
 * NOTE(review): `lastScraped` is read with `moment.unix` (seconds) but written back as a moment
 * object - confirm the repository layer converts between the two.
 *
 * @returns {Promise<undefined>} settles once the selected scrape has finished.
 */
async function scrape() {
  const startedAt = moment();
  const provider = await repository.getProvider({ name: NAME });
  const previousRun = provider.lastScraped && moment.unix(provider.lastScraped);
  const fullScrapeDue = !previousRun || previousRun.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < startedAt;

  if (!fullScrapeDue) {
    console.log(`[${startedAt}] scrapping latest ${NAME} entries...`);
    await _scrapeLatestEntries();
    console.log(`[${moment()}] finished scrapping latest ${NAME} entries`);
    return;
  }

  console.log(`[${startedAt}] scrapping all ${NAME} shows...`);
  await _scrapeAllShows();
  provider.lastScraped = startedAt;
  await repository.updateProvider(provider);
  console.log(`[${moment()}] finished scrapping all ${NAME} shows`);
}

/**
 * Fetches the latest entries and parses each of them through the show-level limiter.
 * A failure while parsing one entry is logged and does not reject the overall result.
 *
 * @returns {Promise<Array>} one result per entry (`undefined` for entries whose parsing failed).
 */
async function _scrapeLatestEntries() {
  const entries = await horriblesubs.getLatestEntries();
  const logFailure = (error) => console.log(error);
  const parseEntry = (entry) => _parseShowData(entry).catch(logFailure);
  const jobs = entries.map((entry) => limiter.schedule(() => parseEntry(entry)));

  return Promise.all(jobs);
}

/**
 * Fetches the list of all shows, then loads and parses the data of every show through the
 * show-level limiter. A failure for one show (loading or parsing) is logged and does not
 * reject the overall result.
 *
 * @returns {Promise<Array>} one result per show (`undefined` for shows that failed).
 */
async function _scrapeAllShows() {
  const allShows = await horriblesubs.allShows();
  const logFailure = (error) => console.log(error);
  const scrapeShow = (show) => horriblesubs.showData(show)
      .then((data) => _parseShowData(data))
      .catch(logFailure);
  const jobs = allShows.map((show) => limiter.schedule(() => scrapeShow(show)));

  return Promise.all(jobs);
}

/**
 * Diagnostic helper: enriches shows through the kitsu lookup and logs those whose looked-up
 * `kitsu_id` differs from the one stored in the mapping file, followed by the mismatch ratio.
 *
 * NOTE(review): only the first show of the list is sampled (`slice(0, 1)`) - this looks like a
 * debugging leftover; confirm before relying on the reported ratio.
 *
 * @returns {Promise<undefined>}
 */
async function compareSearchKitsuIds() {
  console.log(`${NAME}: initiating kitsu compare...`);
  const allShows = await horriblesubs.allShows();
  const sample = allShows.slice(0, 1);
  const enriched = await Promise.all(sample.map((show) => limiter.schedule(() => enrichShow(show))));

  const mismatches = enriched.filter((show) => {
    const known = showMappings[show.title];
    return known && known.kitsu_id !== show.kitsu_id;
  });
  const mismatchRatio = mismatches.length / enriched.length;

  console.log(mismatches);
  console.log(`Ratio: ${mismatchRatio}`);
}

/**
 * Builds kitsu mappings for every show that has no entry in the mapping file yet and writes
 * the complete mapping back to disk.
 *
 * Newly enriched shows are added to the shared in-memory `showMappings` object (keyed by show
 * title) before it is serialized. The returned promise settles only after the file write has
 * completed; a failed write is logged (with the error message) and does not reject.
 *
 * The output path is relative, so it is resolved against the process working directory.
 *
 * @returns {Promise<undefined>}
 */
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const allShows = await horriblesubs.allShows();
  const unmappedShows = allShows.filter((show) => !showMappings[show.title]);
  const enrichedShows = await Promise.all(
      unmappedShows.map((show) => limiter.schedule(() => enrichShow(show))));

  for (const show of enrichedShows) {
    showMappings[show.title] = show;
  }

  // fs.writeFile is callback based: wrap it so callers can await the end of the write.
  await new Promise((resolve) => {
    fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(showMappings), 'utf8', (err) => {
      if (err) {
        console.log(`An error occurred while writing JSON Object to File: ${err.message}`);
      } else {
        console.log(`${NAME}: finished kitsu mapping`);
      }
      resolve();
    });
  });
}

/**
 * Enriches a show with its HorribleSubs show id and with kitsu metadata looked up by title.
 *
 * - When the show id lookup fails, the show title is used as `showId`.
 * - When the kitsu lookup fails, the failure is logged and the kitsu derived fields
 *   (`kitsu_id`, `kitsuTitle`, `imdb_id`) are `undefined`.
 * - Properties of `show` are applied after `showId` / `kitsu_id`, so a `showId` or `kitsu_id`
 *   already present on `show` wins over the looked-up value.
 *
 * @param {{title: string, url: string}} show show entry to enrich.
 * @returns {Promise<Object>} the enriched show.
 */
async function enrichShow(show) {
  const { title, url } = show;
  console.log(`${NAME}: getting show info for ${title}...`);

  const resolvedShowId = await horriblesubs._getShowId(url).catch(() => title);

  const onMetadataFailure = (error) => {
    console.log(`Failed getting kitsu meta: ${error.message}`);
    return {};
  };
  const kitsuMeta = await getKitsuId({ title })
      .then((kitsuId) => getMetadata(kitsuId))
      .catch(onMetadataFailure);

  return Object.assign(
      { showId: resolvedShowId, kitsu_id: kitsuMeta.kitsuId },
      show,
      { kitsuTitle: kitsuMeta.title, imdb_id: kitsuMeta.imdbId });
}

/**
 * Creates torrent and file entries for every mirror of every episode of a show.
 *
 * Each mirror becomes a torrent candidate, which is then processed through the entry limiter:
 * it is skipped when `checkIfExists` yields nothing, otherwise seeders are refreshed, its
 * files are parsed and verified, and the torrent plus all of its files are stored. The
 * "Created entry" message is logged only after all file records have been stored.
 *
 * @param {Object} showData show data with `title` and optional `singleEpisodes` /
 *     `packEpisodes` arrays; every episode carries `title`, `episode` (string), `uploadDate`
 *     and `mirrors` (each with `magnetLink` and `resolution`).
 * @returns {Promise<undefined>} settles when all entries of the show have been processed.
 * @throws {Error} when the show has no mapping or the mapping has no kitsu id. A failure while
 *     processing any single entry also rejects the returned promise.
 */
async function _parseShowData(showData) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const showMapping = showMappings[showData.title];
  if (!showMapping) {
    throw new Error(`No kitsu mapping found for ${showData.title}`);
  }
  const kitsuId = showMapping.kitsu_id;
  if (!kitsuId) {
    throw new Error(`No kitsuId found for ${showData.title}`);
  }

  // sometimes a horriblesubs entry contains multiple seasons, so it needs to be split per kitsu
  // season entry: absolute episode number -> { kitsuId, episode within that season, season title }
  const kitsuIdsMapping = {};
  if (Array.isArray(kitsuId)) {
    const metas = await Promise.all(kitsuId.map((seasonKitsuId) => getMetadata(seasonKitsuId)));
    metas.forEach((meta) => {
      // episodes of this season continue numbering after those of the previous seasons
      const epOffset = Object.keys(kitsuIdsMapping).length;
      // NOTE(review): a missing totalCount yields one mapped episode, because Array(undefined)
      // has length 1 - confirm this is the intended handling for seasons of unknown length.
      [...Array(meta.totalCount).keys()]
          .map((index) => index + 1)
          .forEach((ep) => {
            kitsuIdsMapping[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title };
          });
    });
  }

  const formatTitle = (episodeInfo, mirror) => {
    // mapping keys have no leading zeros ("01" -> "1")
    const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
    if (mapping) {
      return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
    }
    return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
  };

  const resolveKitsuId = (inputEpisode) => {
    // for a range such as "01-12" the first episode of the range decides the season
    const episode = parseInt(inputEpisode.split('-')[0] || inputEpisode, 10);
    return kitsuIdsMapping[episode] && kitsuIdsMapping[episode].kitsuId || kitsuId;
  };

  const toIncompleteTorrent = (episodeInfo, mirror) => {
    const magnet = decode(mirror.magnetLink);
    return {
      provider: NAME,
      ...mirror,
      infoHash: magnet.infoHash,
      // `tr` is absent without trackers and a plain string for a single tracker
      trackers: [].concat(magnet.tr || []).join(','),
      title: formatTitle(episodeInfo, mirror),
      // NOTE(review): fixed placeholder size - presumably the real size is not known at this point
      size: 300000000,
      type: Type.ANIME,
      kitsuId: resolveKitsuId(episodeInfo.episode),
      uploadDate: episodeInfo.uploadDate,
    };
  };

  const processEntry = async (incompleteTorrent) => {
    const candidate = await checkIfExists(incompleteTorrent);
    const torrent = candidate && await updateCurrentSeeders(candidate);
    if (!torrent) {
      return;
    }
    const parsedFiles = await parseTorrentFiles(torrent);
    const files = await verifyFiles(torrent, parsedFiles);
    await repository.createTorrent(torrent);
    // wait for every file record so failures surface and the log below is truthful
    await Promise.all(files.map((file) => repository.createFile(file)));
    console.log(`Created entry for ${torrent.title}`);
  };

  const episodes = [].concat(showData.singleEpisodes || [], showData.packEpisodes || []);
  const incompleteTorrents = episodes
      .map((episodeInfo) => (episodeInfo.mirrors || [])
          .map((mirror) => toIncompleteTorrent(episodeInfo, mirror)))
      .reduce((all, perEpisode) => all.concat(perEpisode), []);

  await Promise.all(incompleteTorrents
      .map((incompleteTorrent) => entryLimiter.schedule(() => processEntry(incompleteTorrent))));
  console.log(`${NAME}: finished scrapping ${showData.title} data`);
}

/**
 * Reconciles freshly parsed torrent files with the file records already stored for the same
 * info hash.
 *
 * When stored records exist, every parsed file receives the `id` of the stored record with
 * the same `fileIndex` (or `undefined` when there is none) and its `size`, falling back to
 * the parsed size. When nothing is stored, or the lookup fails, the parsed files are returned
 * unchanged.
 *
 * @param {{title: string}} torrent torrent the files belong to (used in the error message).
 * @param {Array<Object>} files parsed files, each with `infoHash`, `fileIndex` and `size`.
 * @returns {Promise<Array<Object>>} the files to store.
 * @throws {Error} when `files` is missing or empty.
 */
async function verifyFiles(torrent, files) {
  if (!files || !files.length) {
    throw new Error(`No video files found for: ${torrent.title}`);
  }

  // best effort lookup: any failure is treated as "nothing stored yet"
  const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
      .then((existing) => {
        const byIndex = {};
        existing.forEach((file) => {
          byIndex[file.fileIndex] = file;
        });
        return byIndex;
      })
      .catch(() => undefined);
  if (!existingFiles || !Object.keys(existingFiles).length) {
    return files;
  }

  return files.map((file) => {
    const stored = existingFiles[file.fileIndex];
    return {
      ...file,
      id: stored && stored.id,
      size: stored && stored.size || file.size,
    };
  });
}

/**
 * Decides whether a torrent still has to be created by this provider.
 *
 * @param {Object} torrent torrent candidate, passed as-is to `repository.getTorrent`.
 * @returns {Promise<Object|undefined>}
 *   - the given torrent when no stored torrent is found (a failed lookup counts as not found);
 *   - `undefined` when the stored torrent already belongs to this provider;
 *   - otherwise a copy of the given torrent carrying the stored `size` and `seeders`.
 */
async function checkIfExists(torrent) {
  const stored = await repository.getTorrent(torrent).catch(() => undefined);

  if (stored && stored.provider === NAME) {
    // already created by this provider: nothing left to do
    return undefined;
  }
  if (stored) {
    // known from another provider: reuse the stored size and seeders
    return { ...torrent, size: stored.size, seeders: stored.seeders };
  }
  // nothing stored yet
  return torrent;
}

module.exports = { scrape };