updates the package structure and updates initial horriblesubs scraper WIP

2020-01-07 13:33:46 +01:00
parent 643917939b
commit 051c50de3f
12 changed files with 264 additions and 280 deletions
--- a/index.js
+++ b/index.js
@@ -2,10 +2,10 @@ require('dotenv').config();
 const express = require("express");
 const server = express();
 const { connect } = require('./lib/repository');
-const tpbDump = require('./scrapers/piratebay_dump');
-const horribleSubsScraper = require('./scrapers/horiblesubs_scraper');
+const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
+const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');

-const providers = [tpbDump];
+const providers = [horribleSubsScraper];

 async function scrape() {
  providers.forEach((provider) => provider.scrape());
--- a/lib/cache.js
+++ b/lib/cache.js
@@ -11,6 +11,21 @@ const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days
 const MONGO_URI = process.env.MONGODB_URI;

 const cache = initiateCache();
+const torrentFilesCache = initiateTorrentFilesCache();
+
+/**
+ * Creates the cache used to look up previously stored torrent file listings.
+ *
+ * Uses the MongoDB store (collection `cacheManager`) when MONGODB_URI is configured. Without it
+ * an in-memory store is returned so that `torrentFilesCache` is never undefined and
+ * `retrieveTorrentFiles` rejects with its regular "No cached files found" error instead of
+ * throwing a TypeError on `undefined.get`.
+ *
+ * @returns {object} cache-manager cache instance
+ */
+function initiateTorrentFilesCache() {
+  if (MONGO_URI) {
+    return cacheManager.caching({
+      store: mangodbStore,
+      uri: MONGO_URI,
+      options: {
+        collection: 'cacheManager',
+      },
+      ttl: GLOBAL_TTL,
+      ignoreCacheErrors: true
+    });
+  }
+  // No MongoDB configured: keep the cache usable with a process-local store.
+  return cacheManager.caching({
+    store: 'memory',
+    ttl: GLOBAL_TTL
+  });
+}

 function initiateCache() {
  if (MONGO_URI) {
@@ -18,7 +33,7 @@ function initiateCache() {
      store: mangodbStore,
      uri: MONGO_URI,
      options: {
-        collection: 'cacheManager',
+        collection: 'torrentio_scraper_collection',
      },
      ttl: GLOBAL_TTL,
      ignoreCacheErrors: true
@@ -32,7 +47,7 @@ function initiateCache() {
 }

 function retrieveTorrentFiles(infoHash) {
-  return cache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
+  return torrentFilesCache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
      .then((results) => {
        if (!results) {
          throw new Error('No cached files found');
--- a/lib/metadata.js
+++ b/lib/metadata.js
@@ -2,52 +2,46 @@ const needle = require('needle');
 const nameToImdb = require('name-to-imdb');
 const bing = require('nodejs-bing');
 const { cacheWrapImdbId, cacheWrapMetadata } = require('./cache');
+const { Type } = require('./types');

 const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
 const KITSU_URL = 'https://anime-kitsu.now.sh';

-function getMetadata(imdbId, type) {
-  return cacheWrapMetadata(imdbId,
-      () => needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 60000 })
-          .then((response) => {
-            const body = response.body;
-            if (body && body.meta && body.meta.name) {
-              return {
-                imdbId: imdbId,
-                title: body.meta.name,
-                year: body.meta.year,
-                genres: body.meta.genres,
-                totalEpisodes: body.meta.videos && body.meta.videos
-                    .filter(video => video.season > 0).length,
-                episodeCount: body.meta.videos && Object.values(body.meta.videos
-                    .filter((entry) => entry.season !== 0)
-                    .sort((a, b) => a.season - b.season)
-                    .reduce((map, next) => {
-                      map[next.season] = map[next.season] + 1 || 1;
-                      return map;
-                    }, {}))
-              };
-            } else {
-              throw new Error('No search results');
-            }
-          })
-          .catch((error) => {
-            throw new Error(`failed cinemeta query ${imdbId} due: ${error.message}`);
-          }));
-}
-
-function getKitsuMetadata(kitsuId) {
-  const key = kitsuId.startsWith('kitsu:') ? kitsuId : `kitsu:${kitsuId}`;
+function getMetadata(id, type = Type.SERIES ) {
+  const key = id.match(/^\d+$/) ? `kitsu:${id}` : id;
  return cacheWrapMetadata(key,
      () => needle('get', `${KITSU_URL}/meta/series/${key}.json`, { open_timeout: 60000 })
      .then((response) => {
        const body = response.body;
        if (body && body.meta && body.meta.id) {
          return {
-            ...body.meta,
-            videos: undefined,
-            totalEpisodes: body.meta.videos && body.meta.videos
-              .filter(video => video.season > 0).length
+            kitsuId: body.kitsu_id,
+            imdbId: body.imdb_id,
+            title: body.meta.name,
+            year: body.meta.year,
+            genres: body.meta.genres,
+            videos: body.meta.videos && body.meta.videos
+                .map((video) => video.imdbSeason
+                    ? {
+                      episode: video.episode,
+                      imdbSeason: video.imdbSeason,
+                      imdbEpisode: video.imdbEpisode
+                    }
+                    : {
+                      season: video.season,
+                      episode: video.episode,
+                      kitsuId: video.kitsu_id,
+                      kitsuEpisode: video.kitsuEpisode,
+                      released: video.released
+                    }
+                ),
+            episodeCount: body.meta.videos && Object.values(body.meta.videos
+                .filter((entry) => entry.season !== 0)
+                .sort((a, b) => a.season - b.season)
+                .reduce((map, next) => {
+                  map[next.season] = map[next.season] + 1 || 1;
+                  return map;
+                }, {}))
          };
        } else {
          throw new Error('No search results');
@@ -101,4 +95,4 @@ async function getKitsuId(title) {
        }));
 }

-module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuMetadata, getKitsuId };
+module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuId };
--- a/lib/torrent.js
+++ b/lib/torrent.js
@@ -6,6 +6,10 @@ const { retrieveTorrentFiles } = require('./cache');
 const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
 const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];

+/**
+ * Placeholder seeder refresh: resolves with a shallow copy of the torrent whose `seeders`
+ * field is set to 0. The object passed in is not modified.
+ *
+ * @param {object} torrent torrent entry
+ * @returns {Promise<object>} copy of the torrent with `seeders: 0`
+ */
+module.exports.updateCurrentSeeders = async function(torrent) {
+  const refreshed = { ...torrent };
+  refreshed.seeders = 0;
+  return refreshed;
+};
+
 module.exports.torrentFiles = function(torrent) {
  return filesFromTorrentFile(torrent)
      .catch(() => filesFromTorrentStream(torrent))
@@ -28,7 +32,7 @@ async function filesFromTorrentFile(torrent) {
    return Promise.reject(new Error("no torrentLink"));
  }

-  needle('get', torrent.torrentLink, { open_timeout: 2000 })
+  return needle('get', torrent.torrentLink, { open_timeout: 10000 })
      .then((response) => {
        if (!response.body || response.statusCode !== 200) {
          throw new Error('torrent not found')
--- a/lib/torrentFiles.js
+++ b/lib/torrentFiles.js
@@ -5,8 +5,14 @@ const { Type } = require('./types');

 const MIN_SIZE = 20 * 1024 * 1024; // 20 MB

-async function parseTorrentFiles(torrent, imdbId) {
+async function parseTorrentFiles(torrent, imdbId, kitsuId) {
  const parsedTorrentName = parse(torrent.title);
+  parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
+  const metadata = await getMetadata(kitsuId || imdbId, torrent.type || Type.MOVIE).catch(() => undefined);
+
+  if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
+    throw new Error(`Mismatching entry type for ${torrent.name}: ${torrent.type}!=${metadata.type}`);
+  }

  if (torrent.type === Type.MOVIE) {
    if (parsedTorrentName.complete) {
@@ -27,31 +33,20 @@ async function parseTorrentFiles(torrent, imdbId) {
          });
    }

-    return [{
+    return [ {
      infoHash: torrent.infoHash,
      title: torrent.title,
      size: torrent.size,
-      imdbId: imdbId,
-    }];
+      imdbId: imdbId || metadata && metadata.imdb_id,
+      kitsuId: kitsuId || metadata && metadata.kitsu_id
+    } ];
  }

-  if (parsedTorrentName.season && parsedTorrentName.episode) {
-    return [{
-      infoHash: torrent.infoHash,
-      title: torrent.title,
-      size: torrent.size,
-      imdbId: imdbId,
-      imdbSeason: parsedTorrentName.season,
-      imdbEpisode: parsedTorrentName.episode
-    }];
-  }
-
-  parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
-  return torrentFiles(torrent)
+  return getSeriesFiles(torrent, parsedTorrentName)
      .then((files) => files
          .filter((file) => file.size > MIN_SIZE)
          .map((file) => parseSeriesFile(file, parsedTorrentName)))
-      .then((files) => decomposeAbsoluteEpisodes(files, torrent, imdbId))
+      .then((files) => decomposeAbsoluteEpisodes(files, metadata))
      .then((files) => Promise.all(files.map(file => file.isMovie
          ? mapSeriesMovie(file, torrent.infoHash)
          : mapSeriesEpisode(file, torrent.infoHash, imdbId))))
@@ -62,6 +57,18 @@ async function parseTorrentFiles(torrent, imdbId) {
      });
 }

+/**
+ * Resolves the list of files that should be parsed for a series torrent.
+ *
+ * When the torrent title itself already parsed to an episode or a date, the torrent is treated
+ * as a single file named after the title, so the real file listing is not fetched. Otherwise
+ * the listing is loaded through `torrentFiles`.
+ *
+ * @param {object} torrent torrent entry; `title` and `size` are read
+ * @param {object} parsedTorrentName parsed title info; `episode` and `date` are read
+ * @returns {Promise<object[]>} file descriptors
+ */
+async function getSeriesFiles(torrent, parsedTorrentName) {
+  const isSingleRelease = Boolean(parsedTorrentName.episode || parsedTorrentName.date);
+  if (!isSingleRelease) {
+    return torrentFiles(torrent);
+  }
+
+  const { title, size } = torrent;
+  return [ { name: title, path: title, size: size } ];
+}
+
 async function mapSeriesEpisode(file, infoHash, imdbId) {
  if (!file.episodes) {
    return Promise.resolve([]);
@@ -69,21 +76,22 @@ async function mapSeriesEpisode(file, infoHash, imdbId) {
  return Promise.resolve(file.episodes.map(episode => ({
    infoHash: infoHash,
    fileIndex: file.fileIndex,
-    title: file.name,
+    title: file.path || file.name,
    size: file.size,
    imdbId: imdbId,
    imdbSeason: file.season,
-    imdbEpisode: episode})))
+    imdbEpisode: episode
+  })))
 }

 async function mapSeriesMovie(file, infoHash) {
-  return findMovieImdbId(file).then((imdbId) => [{
+  return findMovieImdbId(file).then((imdbId) => [ {
    infoHash: infoHash,
    fileIndex: file.fileIndex,
    title: file.name,
    size: file.size,
    imdbId: imdbId
-  }])
+  } ])
 }

 function parseSeriesFile(file, parsedTorrentName) {
@@ -96,7 +104,8 @@ function parseSeriesFile(file, parsedTorrentName) {
    const pathInfo = parse(folders[folders.length - 2]);
    fileInfo.season = pathInfo.season;
  }
-  fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season && !fileInfo.episodes || !!fileInfo.year;
+  fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season &&
+      (!fileInfo.episodes || !!fileInfo.year || !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i));

  return { ...file, ...fileInfo };
 }
@@ -111,12 +120,11 @@ function findMovieImdbId(title) {
  return getImdbId(searchQuery).catch((error) => undefined);
 }

-async function decomposeAbsoluteEpisodes(files, torrent, imdbId) {
+async function decomposeAbsoluteEpisodes(files, metadata) {
  if (files.every((file) => !file.episodes || file.episodes.every((ep) => ep < 100))) {
    return files; // nothing to decompose
  }

-  const metadata = await getMetadata(imdbId, torrent.type || Type.MOVIE);
  // decompose if season is inside path, but individual files are concatenated ex. 101 (S01E01)
  files
      .filter(file => file.season && metadata.episodeCount[file.season] < 100)
--- a/scrapers/manual/manual.js
+++ b/scrapers/manual/manual.js
@@ -1,11 +1,11 @@
 require('dotenv').config();
-const repository = require('../../lib/repository');
-const { parseTorrentFiles } = require('../../lib/torrentFiles');
-const { Type } = require('../../lib/types');
+const repository = require('../lib/repository');
+const { parseTorrentFiles } = require('../lib/torrentFiles');
+const { Type } = require('../lib/types');

 async function addMissingEpisodes() {
  const torrent = { infoHash: '0ec780c2c7f8d5b38e61827f0b53c77c3d22f955' };
-  const torrentFiles = await require('../../lib/torrent').torrentFiles(torrent);
+  const torrentFiles = await require('../lib/torrent').torrentFiles(torrent);
  const storedFiles = await repository.getFiles(torrent)
      .then((files) => files.reduce((map, next) => (map[next.fileIndex] = next, map), {}));
  const imdbId = Object.values(storedFiles)[0].imdbId;
--- a/scrapers/horiblesubs_scraper.js
+++ b/scrapers/horiblesubs_scraper.js
@@ -1,172 +0,0 @@
-const moment = require('moment');
-const fs = require('fs');
-const needle = require('needle');
-const Bottleneck = require('bottleneck');
-const { parse } = require('parse-torrent-title');
-const decode = require('magnet-uri');
-const horriblesubs = require('./api/horriblesubs');
-const { Type } = require('../lib/types');
-const { torrentFiles, currentSeeders } = require('../lib/torrent');
-const repository = require('../lib/repository');
-const { getImdbId, getMetadata, getKitsuId, getKitsuMetadata } = require('../lib/metadata');
-
-const NAME = 'HorribleSubs';
-
-const limiter = new Bottleneck({maxConcurrent: 5});
-const entryLimiter = new Bottleneck({maxConcurrent: 20});
-
-async function scrape() {
-  const lastScraped = await repository.getProvider({ name: NAME });
-
-  if (!lastScraped.lastScraped) {
-    console.log(`${NAME}: no previous scrapping exist`);
-    await _scrapeAllShows()
-  }
-}
-
-async function _scrapeAllShows() {
-  initMapping();
-  // console.log(`${NAME}: getting all shows...`);
-  // const shows = await horriblesubs.allShows();
-
-  // Promise.all(shows
-  //     .slice(0, 20)
-  //     //.filter(show => show.url.includes('piece'))
-  //     .map((show) => limiter.schedule(() => horriblesubs.showData(show)
-  //     .then((showData) => _parseShowData(showData))
-  //     .catch((err) => console.log(err)))));
-}
-
-async function initMapping() {
-  console.log(`${NAME}: initiating kitsu mapping...`);
-  const currentMapping = require('../horrible_subs_mapping');
-  const mappings = Object.values(currentMapping);
-  const shows = await horriblesubs.allShows()
-      .then((shows) => shows.filter((show) => !mappings.find((mapping) => mapping.title === show.title)))
-      .then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
-      .then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), currentMapping));
-
-  fs.writeFile("./horrible_subs_mapping.json", JSON.stringify(shows), 'utf8', function (err) {
-    if (err) {
-      console.log("An error occurred while writing JSON Object to File.");
-    }
-  });
-  console.log(`${NAME}: finished kitsu mapping`);
-}
-
-async function enrichShow(show) {
-  console.log(`${NAME}: getting show info for ${show.title}...`);
-  const showId = await horriblesubs._getShowId(show.url)
-    .catch((error) => show.title);
-  const metadata = await getKitsuId(show.title)
-    .then((kitsuId) => getKitsuMetadata(kitsuId))
-    .catch((error) => {
-      console.log(`Failed getting kitsu meta: ${error.message}`);
-      return {};
-    });
-
-  return {
-    showId: showId,
-    ...show,
-    kitsu_id: metadata.kitsu_id,
-    kitsuTitle: metadata.name,
-    kitsuSlug: metadata.slug,
-    imdb_id: metadata.imdb_id
-  }
-}
-
-async function _parseShowData(showData) {
-  console.log(`${NAME}: scrapping ${showData.title} data...`);
-  const imdbId = hardcodedShows[showData.showId] || await getImdbId({
-    name: showData.title.replace(/\W+/g, ' ').toLowerCase(),
-    type: 'series'
-  }).catch(() => undefined);
-  const metadata = imdbId && await getMetadata(imdbId, 'series') || {};
-
-  return Promise.all([
-      showData.singleEpisodes
-          .map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror))))
-          .reduce((a, b) => a.concat(b), []),
-      showData.packEpisodes
-          .map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() =>_constructPackEntry(metadata, pack, mirror))))
-          .reduce((a, b) => a.concat(b), [])
-  ].reduce((a, b) => a.concat(b), []))
-      .then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent)));
-}
-
-async function _constructSingleEntry(metadata, single, mirror) {
-  mirror.infoHash = decode(mirror.magnetLink).infoHash;
-  const seeders = await currentSeeders(mirror);
-  const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/);
-  const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season
-  const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode
-  const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode);
-  const title = `${single.title} ${single.episode} [${mirror.resolution}]`;
-  const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode};
-
-  return {
-    infoHash: mirror.infoHash,
-    provider: NAME,
-    title: title,
-    type: Type.ANIME,
-    imdbId: metadata.imdbId,
-    uploadDate: single.uploadDate,
-    seeders: seeders,
-    files: [file]
-  }
-}
-
-async function _constructPackEntry(metadata, pack, mirror) {
-  mirror.infoHash = decode(mirror.magnetLink).infoHash;
-  const seeders = await currentSeeders(mirror);
-  const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/);
-  const xSeason = seasonMatch && parseInt(seasonMatch[1]);
-
-  const files = await torrentFiles(mirror)
-      .then((files) => files.map((file) => {
-        const title = file.path.match(/[^\/]+$/)[0];
-        const titleInfo = parse(title.replace(pack.title, ''));
-        return titleInfo.episodes
-            .map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode))
-            .map((actual) => ({
-              title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode
-            }));
-      }))
-      .then((files) => files.reduce((a, b) => a.concat(b), []))
-      .catch(() => []);
-
-  return {
-    infoHash: mirror.infoHash,
-    provider: NAME,
-    title: `${pack.title} ${pack.episode} [${mirror.resolution}]`,
-    type: 'anime',
-    imdbId: metadata.imdbId,
-    uploadDate: pack.uploadDate,
-    seeders: seeders,
-    files: files
-  }
-}
-
-function actualSeasonEpisode(metadata, xSeason, xEpisode) {
-  if (xSeason) {
-    return {
-      season: xSeason,
-      episode: xEpisode,
-      absoluteEpisode: metadata.episodeCount && metadata.episodeCount
-          .slice(0, xSeason - 1)
-          .reduce((a, b) => a + b, xEpisode),
-    }
-  } else if (metadata.episodeCount) {
-    return metadata.episodeCount
-        .reduce((epInfo, epCount) => {
-          if (epInfo.episode > epCount) {
-            epInfo.season = epInfo.season + 1;
-            epInfo.episode = epInfo.episode - epCount;
-          }
-          return epInfo;
-        }, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode })
-  }
-  return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode }
-}
-
-module.exports = { scrape };
--- a/scrapers/horriblesubs/horriblesubs_api.js
+++ b/scrapers/horriblesubs/horriblesubs_api.js
@@ -1,13 +1,14 @@
 const cheerio = require('cheerio');
 const needle = require('needle');
 const moment = require('moment');
+const decode = require('magnet-uri');

 const defaultUrl = 'https://horriblesubs.info';
 const defaultTimeout = 5000;

 function allShows(config = {}) {
  return _getContent('/shows', config)
-      .then(($) =>  $('div[class="ind-show"]')
+      .then(($) => $('div[class="ind-show"]')
          .map((index, element) => $(element).children('a'))
          .map((index, element) => ({
            title: element.attr('title'),
@@ -71,11 +72,12 @@ function _getEntries(endpoint, config) {
            episode: $(element).find('a[class="rls-label"]').find('strong').text(),
            uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
            mirrors: $(element).find('div[class="rls-links-container"]').children()
-                  .map((indexLink, elementLink) => ({
-                    resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
-                    magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
-                    torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
-                  })).get()
+                .map((indexLink, elementLink) => ({
+                  resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
+                  infoHash: decode($(elementLink).find('a[title="Magnet Link"]').attr('href')).infoHash,
+                  magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
+                  torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
+                })).get()
          })).get())
 }

@@ -84,13 +86,14 @@ function _getAllLatestEntries(config, page = 0) {
  const entriesEndpoint = `/api.php?method=getlatest${pageParam}`;
  return _getContent(entriesEndpoint, config)
      .then(($) => $('li a')
-      .map((index, element) => ({
-        urlEndpoint: $(element).attr('href'),
-        episode: $(element).find('strong').text()
-      })).get())
-      .then((entries) => entries.length < 12 ? entries :
-          _getAllLatestEntries(config, page + 1)
-          .then((nextEntries) => entries.concat(nextEntries)))
+          .map((index, element) => ({
+            urlEndpoint: $(element).attr('href'),
+            episode: $(element).find('strong').text()
+          })).get())
+      .then((entries) => entries.length < 12
+          ? entries
+          : _getAllLatestEntries(config, page + 1)
+              .then((nextEntries) => entries.concat(nextEntries)))
 }

 async function _findLatestEntry(entry, config) {
@@ -99,8 +102,8 @@ async function _findLatestEntry(entry, config) {
  let page = 0;
  let reachedEnd = false;

-  while(!foundEntry && !reachedEnd) {
-    const allEntries = await _getAllEntries(showId, 'show', config, page,false);
+  while (!foundEntry && !reachedEnd) {
+    const allEntries = await _getAllEntries(showId, 'show', config, page, false);
    foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
    page = page + 1;
    reachedEnd = allEntries.length === 0;
--- a/scrapers/horriblesubs/horriblesubs_mapping.json
+++ b/scrapers/horriblesubs/horriblesubs_mapping.json
--- a/scrapers/horriblesubs/horriblesubs_scraper.js
+++ b/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -0,0 +1,122 @@
+const fs = require('fs');
+const Bottleneck = require('bottleneck');
+const { parse } = require('parse-torrent-title');
+const horriblesubs = require('./horriblesubs_api.js');
+const repository = require('../../lib/repository');
+const { Type } = require('../../lib/types');
+const { updateCurrentSeeders } = require('../../lib/torrent');
+const { parseTorrentFiles } = require('../../lib/torrentFiles');
+const { getMetadata, getKitsuId } = require('../../lib/metadata');
+const showMappings = require('./horriblesubs_mapping.json');
+
+const NAME = 'HorribleSubs';
+
+const limiter = new Bottleneck({ maxConcurrent: 5 });
+const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
+
+/**
+ * Entry point of the HorribleSubs provider.
+ *
+ * Loads the provider record from the repository and runs a scrape of all shows only when that
+ * record has no `lastScraped` value; otherwise it does nothing.
+ *
+ * @returns {Promise<void>}
+ */
+async function scrape() {
+  const provider = await repository.getProvider({ name: NAME });
+  if (provider.lastScraped) {
+    return;
+  }
+
+  console.log(`${NAME}: no previous scrapping exist`);
+  await _scrapeAllShows();
+}
+
+/**
+ * Fetches the show list and scrapes each show through the `limiter` queue.
+ *
+ * A failure while loading or parsing a single show is logged and does not reject the overall
+ * promise.
+ * NOTE(review): only the first 20 shows are processed (`slice(0, 20)`) - this looks like a
+ * work-in-progress limit; confirm before relying on full coverage.
+ *
+ * @returns {Promise<Array>} settles once all scheduled shows were handled
+ */
+async function _scrapeAllShows() {
+  console.log(`${NAME}: getting all shows...`);
+  const shows = await horriblesubs.allShows();
+
+  const scrapeShow = (show) => horriblesubs.showData(show)
+      .then((showData) => _parseShowData(showData))
+      .catch((err) => console.log(err));
+  const jobs = shows
+      .slice(0, 20)
+      .map((show) => limiter.schedule(() => scrapeShow(show)));
+
+  return Promise.all(jobs);
+}
+
+/**
+ * Extends the show mapping with every show that is not mapped yet and writes it to disk.
+ *
+ * Shows whose title is already a key of `showMappings` are skipped. The remaining ones are
+ * enriched through the `limiter` queue and added to `showMappings` in place (the loaded
+ * mapping object is mutated) before the whole mapping is serialized to the JSON file.
+ * The file write is callback based and is not awaited by this function.
+ *
+ * @returns {Promise<void>}
+ */
+async function initMapping() {
+  console.log(`${NAME}: initiating kitsu mapping...`);
+  const allShows = await horriblesubs.allShows();
+  const unmappedShows = allShows.filter((show) => !showMappings[show.title]);
+  const enrichedShows = await Promise.all(
+      unmappedShows.map((show) => limiter.schedule(() => enrichShow(show))));
+  enrichedShows.forEach((show) => {
+    showMappings[show.title] = show;
+  });
+
+  fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(showMappings), 'utf8', (err) => {
+    if (err) {
+      console.log("An error occurred while writing JSON Object to File.");
+      return;
+    }
+    console.log(`${NAME}: finished kitsu mapping`);
+  });
+}
+
+/**
+ * Resolves the HorribleSubs show id and the Kitsu metadata of a show and returns the entry
+ * that is stored in the show mapping.
+ *
+ * Both lookups are best effort: when the show id lookup fails the show title is used as id,
+ * and when the Kitsu lookup fails the kitsu/imdb fields are left undefined.
+ *
+ * @param {object} show show entry from `horriblesubs.allShows()`; `title` and `url` are read
+ * @returns {Promise<object>} the show extended with showId, kitsu_id, kitsuTitle and imdb_id
+ */
+async function enrichShow(show) {
+  console.log(`${NAME}: getting show info for ${show.title}...`);
+  const showId = await horriblesubs._getShowId(show.url)
+      .catch((error) => show.title);
+  const metadata = await getKitsuId(show.title)
+      .then((kitsuId) => getMetadata(kitsuId))
+      .catch((error) => {
+        console.log(`Failed getting kitsu meta: ${error.message}`);
+        return {};
+      });
+
+  // getMetadata exposes camelCase fields (kitsuId, title, imdbId). The mapping entry keeps its
+  // snake_case keys because _parseShowData reads `kitsu_id` from the mapping.
+  return {
+    showId: showId,
+    ...show,
+    kitsu_id: metadata.kitsuId,
+    kitsuTitle: metadata.title,
+    imdb_id: metadata.imdbId
+  };
+}
+
+/**
+ * Stores every mirror of every single episode and pack of a show as a torrent with its files.
+ *
+ * For each mirror a torrent entry is built, checked against the repository (entries already
+ * stored for this provider are skipped), its files are parsed against the show's Kitsu id,
+ * and the torrent plus its files are persisted. Work is throttled through `entryLimiter`.
+ * NOTE(review): the hard-coded `size` looks like a work-in-progress placeholder - confirm.
+ *
+ * @param {object} showData show data; `title`, `singleEpisodes` and `packEpisodes` are read
+ * @returns {Promise<void>} resolves once all entries were processed
+ * @throws {Error} when the show has no mapping or the mapping has no kitsu id
+ */
+async function _parseShowData(showData) {
+  console.log(`${NAME}: scrapping ${showData.title} data...`);
+  const showMapping = showMappings[showData.title];
+  if (!showMapping) {
+    throw new Error(`No kitsu mapping found for ${showData.title}`);
+  }
+  const kitsuId = showMapping.kitsu_id;
+  if (!kitsuId) {
+    throw new Error(`No kitsuId found for ${showData.title}`);
+  }
+
+  return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
+      .map((episodeInfo) => episodeInfo.mirrors
+          .map((mirror) => ({
+            provider: NAME,
+            ...mirror,
+            title: `${episodeInfo.title} ${episodeInfo.episode} [${mirror.resolution}]`,
+            size: 300000000,
+            type: Type.ANIME,
+            uploadDate: episodeInfo.uploadDate,
+          })))
+      .reduce((a, b) => a.concat(b), [])
+      .map((incompleteTorrent) => entryLimiter.schedule(() => checkIfExists(incompleteTorrent)
+          .then((torrent) => torrent && updateCurrentSeeders(torrent))
+          .then((torrent) => torrent && parseTorrentFiles(torrent, undefined, kitsuId)
+              .then((files) => verifyFiles(files, torrent))
+              .then((files) => repository.createTorrent(torrent)
+                  // wait for all files so the log line below is only printed once they were handed off successfully
+                  .then(() => Promise.all(files.map((file) => repository.createFile(file))))
+                  .then(() => console.log(`Created entry for ${torrent.title}`)))))))
+      .then(() => console.log(`${NAME}: finished scrapping ${showData.title} data`));
+}

+/**
+ * Ensures that parsing produced at least one file for a torrent.
+ *
+ * @param {Array} files parsed file entries
+ * @param {object} [torrent] torrent the files belong to; only used for the error message
+ * @returns {Array} the given files when the list is non-empty
+ * @throws {Error} when `files` is missing or empty
+ */
+function verifyFiles(files, torrent) {
+  if (files && files.length) {
+    return files;
+  }
+  throw new Error(`No video files found for: ${torrent && torrent.title}`);
+}
+
+/**
+ * Decides whether a scraped torrent still has to be stored.
+ *
+ * A failed repository lookup is treated the same as "not stored yet".
+ *
+ * @param {object} torrent scraped torrent entry
+ * @returns {Promise<object|undefined>} the torrent itself when nothing is stored yet,
+ *     `undefined` when this provider already stored it, or the torrent with `size` and
+ *     `seeders` taken from the stored entry when it was stored by another provider
+ */
+async function checkIfExists(torrent) {
+  const stored = await repository.getTorrent(torrent).catch(() => undefined);
+  if (stored && stored.provider === NAME) {
+    return undefined; // torrent by this provider already exists
+  }
+  return stored
+      ? { ...torrent, size: stored.size, seeders: stored.seeders }
+      : torrent; // no torrent exists yet
+}
+
+module.exports = { scrape };
--- a/scrapers/thepiratebay/thepiratebay_api.js
+++ b/scrapers/thepiratebay/thepiratebay_api.js
@@ -156,7 +156,8 @@ function parseBody(body) {

    $('table[id=\'searchResult\'] tr').each(function() {
      const name = $(this).find('.detLink').text();
-      if (!name || name === 'Do NOT download any torrent before hiding your IP with a VPN.') {
+      const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
+      if (!name || !sizeMatcher) {
        return;
      }
      torrents.push({
@@ -166,7 +167,7 @@ function parseBody(body) {
        magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
        category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10),
        subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10),
-        size: parseSize($(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/)[1])
+        size: parseSize(sizeMatcher[1])
      });
    });
    resolve(torrents);
--- a/scrapers/thepiratebay/thepiratebay_scraper.js
+++ b/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -5,12 +5,12 @@ const { ungzip } = require('node-gzip');
 const LineByLineReader = require('line-by-line');
 const fs = require('fs');
 const { parse } = require('parse-torrent-title');
-const pirata = require('./api/thepiratebay');
+const thepiratebay = require('./thepiratebay_api.js');
 const bing = require('nodejs-bing');
-const { Type } = require('../lib/types');
-const repository = require('../lib/repository');
-const { getImdbId, escapeTitle } = require('../lib/metadata');
-const { parseTorrentFiles } = require('../lib/torrentFiles');
+const { Type } = require('../../lib/types');
+const repository = require('../../lib/repository');
+const { getImdbId, escapeTitle } = require('../../lib/metadata');
+const { parseTorrentFiles } = require('../../lib/torrentFiles');

 const NAME = 'ThePirateBay';
 const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
@@ -21,7 +21,7 @@ async function scrape() {
  const lastScraped = await repository.getProvider({ name: NAME });
  const lastDump = { updatedAt: 2147000000 };
  const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
-  //const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
+  //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);

  if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) {
    console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
@@ -83,16 +83,16 @@ async function scrape() {
  }
 }
 const allowedCategories = [
-  pirata.Categories.VIDEO.MOVIES,
-  pirata.Categories.VIDEO.MOVIES_HD,
-  pirata.Categories.VIDEO.MOVIES_DVDR,
-  pirata.Categories.VIDEO.MOVIES_3D,
-  pirata.Categories.VIDEO.TV_SHOWS,
-  pirata.Categories.VIDEO.TV_SHOWS_HD
+  thepiratebay.Categories.VIDEO.MOVIES,
+  thepiratebay.Categories.VIDEO.MOVIES_HD,
+  thepiratebay.Categories.VIDEO.MOVIES_DVDR,
+  thepiratebay.Categories.VIDEO.MOVIES_3D,
+  thepiratebay.Categories.VIDEO.TV_SHOWS,
+  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
 ];
 const seriesCategories = [
-  pirata.Categories.VIDEO.TV_SHOWS,
-  pirata.Categories.VIDEO.TV_SHOWS_HD
+  thepiratebay.Categories.VIDEO.TV_SHOWS,
+  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
 ];
 async function processTorrentRecord(record) {
  const alreadyExists = await repository.getSkipTorrent(record)
@@ -132,7 +132,7 @@ async function processTorrentRecord(record) {
    seeders: torrentFound.seeders,
  };

-  if (!imdbId) {
+  if (!imdbId && !titleInfo.complete) {
    console.log(`imdbId not found: ${torrentFound.name}`);
    repository.createFailedImdbTorrent(torrent);
    return;
@@ -145,8 +145,8 @@ async function processTorrentRecord(record) {
  }

  repository.createTorrent(torrent)
-      .then(() => files.forEach(file => repository.createFile(file)));
-  console.log(`Created entry for ${torrentFound.name}`);
+      .then(() => files.forEach(file => repository.createFile(file)))
+      .then(() => console.log(`Created entry for ${torrentFound.name}`));
 }

 async function findTorrent(record) {
@@ -158,7 +158,7 @@ async function findTorrentInSource(record) {
  let page = 0;
  let torrentFound;
  while (!torrentFound && page < 5) {
-    const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
+    const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
    torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0];
    page = torrents.length === 0 ? 1000 : page + 1;
  }
@@ -178,10 +178,19 @@ async function findTorrentViaBing(record) {
        }
        return result.link.match(/torrent\/(\w+)\//)[1];
      })
-      .then((torrentId) => pirata.torrent(torrentId))
+      .then((torrentId) => thepiratebay.torrent(torrentId))
 }

 function downloadDump(dump) {
+  try {
+    if (fs.existsSync(CSV_FILE_PATH)) {
+      console.log('dump file already exist...');
+      return;
+    }
+  } catch(err) {
+    console.error(err)
+  }
+
  console.log('downloading dump file...');
  return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
      .then((response) => response.body)