[scraper] updates scrapers and unique index

2024-12-20 03:29:51 +00:00 · 2020-03-14 22:25:51 +01:00
parent 1efaa0451c
commit 326a07b82e
13 changed files with 110 additions and 57 deletions
--- a/scraper/index.js
+++ b/scraper/index.js
@@ -14,7 +14,7 @@ const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepi

 const PROVIDERS = [
  // horribleSubsScraper,
-  rarbgScraper,
+  // rarbgScraper,
  thepiratebayScraper,
  kickassScraper,
  leetxScraper
@@ -42,7 +42,7 @@ function enableScheduling() {
 }

 server.get('/', function (req, res) {
-  res.send(200);
+  res.sendStatus(200);
 });

 server.listen(process.env.PORT || 7000, async () => {
--- a/scraper/lib/metadata.js
+++ b/scraper/lib/metadata.js
@@ -96,10 +96,11 @@ async function getImdbId(info, type) {
        });
      }).catch(() => bing.web(`${name} ${year || ''} ${type} imdb`)
          .then(results => results
-              .map((result) => result.link)
+              .map(result => result.link)
              .find(result => result.includes('imdb.com/title/')))
          .then(result => result && result.match(/imdb\.com\/title\/(tt\d+)/))
-          .then(match => match && match[1])));
+          .then(match => match && match[1])))
+      .then(imdbId => imdbId && 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0'));
 }

 async function getKitsuId(info) {
--- a/scraper/lib/repository.js
+++ b/scraper/lib/repository.js
@@ -1,9 +1,14 @@
-const { Sequelize } = require('sequelize');
+const { Sequelize, fn, col } = require('sequelize');
 const Op = Sequelize.Op;

 const DATABASE_URI = process.env.DATABASE_URI;

-const database = new Sequelize(DATABASE_URI, { logging: false });
+const database = new Sequelize(
+    DATABASE_URI,
+    {
+      logging: false
+    }
+);

 const Provider = database.define('provider', {
  name: { type: Sequelize.STRING(32), primaryKey: true },
@@ -47,6 +52,19 @@ const File = database.define('file',
    },
    {
      indexes: [
+        {
+          unique: true,
+          name: 'files_unique_file_constraint',
+          fields: [
+            col('infoHash'),
+            fn('COALESCE', (col('fileIndex')), -1),
+            fn('COALESCE', (col('imdbId')), 'null'),
+            fn('COALESCE', (col('imdbSeason')), -1),
+            fn('COALESCE', (col('imdbEpisode')), -1),
+            fn('COALESCE', (col('kitsuId')), -1),
+            fn('COALESCE', (col('kitsuEpisode')), -1)
+          ]
+        },
        { unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] },
        { unique: false, fields: ['kitsuId', 'kitsuEpisode'] }
      ]
@@ -63,7 +81,11 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', {
 });

 function connect() {
-  return database.sync({ alter: true });
+  return database.sync({ alter: true })
+      .catch(error => {
+        console.error('Failed syncing database: ', error);
+        throw error;
+      });
 }

 function getProvider(provider) {
--- a/scraper/lib/torrent.js
+++ b/scraper/lib/torrent.js
@@ -111,7 +111,7 @@ async function filesFromTorrentStream(torrent) {
 function filterVideos(files) {
  return files.filter((file) => {
    const match = file.path.match(/\.(\w{2,4})$/);
-    return match && EXTENSIONS.includes(match[1]);
+    return match && EXTENSIONS.includes(match[1].toLowerCase());
  });
 }

--- a/scraper/lib/torrentFiles.js
+++ b/scraper/lib/torrentFiles.js
@@ -62,8 +62,9 @@ async function parseTorrentFiles(torrent) {
 }

 async function getSeriesFiles(torrent, parsedTorrentName) {
-  if ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
-      (!parsedTorrentName.episodes && parsedTorrentName.date)) {
+  if (!parsedTorrentName.complete && !parsedTorrentName.hasMovies &&
+      ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
+          (!parsedTorrentName.episodes && parsedTorrentName.date))) {
    return [{
      name: torrent.title,
      path: torrent.title,
--- a/scraper/manual/manual.js
+++ b/scraper/manual/manual.js
@@ -2,6 +2,7 @@ require('dotenv').config();
 const Bottleneck = require('bottleneck');
 const { parse } = require('parse-torrent-title');
 const repository = require('../lib/repository');
+const { getImdbId } = require('../lib/metadata');
 const { parseTorrentFiles } = require('../lib/torrentFiles');
 const { Type } = require('../lib/types');

@@ -15,7 +16,7 @@ async function addMissingEpisodes() {
  const imdbId = Object.values(storedFiles)[0].imdbId;

  torrentFiles
-      .filter((file) => !storedFiles[file.fileIndex])
+      .filter((file) => !storedFiles[file.fileIndex !== undefined ? file.fileIndex : null])
      .map((file) => ({
        infoHash: torrent.infoHash,
        fileIndex: file.fileIndex,
@@ -63,11 +64,22 @@ async function reapplySeriesSeasonsSavedAsMovies() {
      .then(() => console.log('Finished updating multiple torrents'));
 }

+async function reapplyDecomposingToTorrentsOnRegex(regex) {
+  return repository.getTorrentsBasedOnTitle(regex, Type.ANIME)
+      .then(torrents => Promise.all(torrents
+          .map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, true)))))
+      .then(() => console.log('Finished updating multiple torrents'));
+}
+
 async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
  const torrent = await repository.getTorrent({ infoHash });
  const storedFiles = await repository.getFiles({ infoHash });
  const fileIndexMap = storedFiles
-      .reduce((map, next) => (map[next.fileIndex] = (map[next.fileIndex] || []).concat(next), map), {});
+      .reduce((map, next) => {
+        const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+        map[fileIndex] = (map[fileIndex] || []).concat(next);
+        return map;
+      }, {});
  const files = includeSourceFiles && Object.values(fileIndexMap)
      .map(sameIndexFiles => sameIndexFiles[0])
      .map(file => ({
@@ -76,12 +88,14 @@ async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
        path: file.title,
        size: file.size
      }));
-  const imdbId = storedFiles[0].imdbId;
+  const imdbId = storedFiles.length && storedFiles[0].imdbId || await getImdbId(parse(torrent.title));

  return parseTorrentFiles({ ...torrent, imdbId, files })
      .then(newFiles => newFiles.map(file => {
-        if (fileIndexMap[file.fileIndex]) {
-          const originalFile = fileIndexMap[file.fileIndex].shift();
+        const fileIndex = file.fileIndex !== undefined ? file.fileIndex : null;
+        const mapping = fileIndexMap[fileIndex];
+        if (mapping) {
+          const originalFile = mapping.shift();
          if (originalFile) {
            if (!originalFile.imdbId) {
              originalFile.imdbId = file.imdbId
@@ -176,5 +190,7 @@ async function findAllFiles() {
 //addMissingEpisodes().then(() => console.log('Finished'));
 //findAllFiles().then(() => console.log('Finished'));
 //updateMovieCollections().then(() => console.log('Finished'));
-reapplyEpisodeDecomposing('d71c4fd1cb9bb9c5365a570b903a3a58774f61a5', true).then(() => console.log('Finished'));
-//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
+reapplyEpisodeDecomposing('aec7bcac457ad68924e7119f859cf6fa3878f9f5', false).then(() => console.log('Finished'));
+//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
+// reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished'));
+//reapplyManualHashes().then(() => console.log('Finished'));
--- a/scraper/scrapers/1337x/1337x_scraper.js
+++ b/scraper/scrapers/1337x/1337x_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -40,14 +40,15 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return leetx.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
+
 }

 async function processTorrentRecord(record) {
--- a/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
+++ b/scraper/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -11,7 +11,7 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
 const showMappings = require('./horriblesubs_mapping.json');

 const NAME = 'HorribleSubs';
-const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
+const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;

 const limiter = new Bottleneck({ maxConcurrent: 5 });
 const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
@@ -26,7 +26,7 @@ async function scrape() {
    return _scrapeAllShows()
        .then(() => {
          lastScrape.lastScraped = scrapeStart;
-          return repository.updateProvider(lastScrape);
+          return lastScrape.save();
        })
        .then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
  } else {
@@ -140,7 +140,7 @@ async function _parseShowData(showData) {
    return kitsuId;
  };

-  return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
+  return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
      .map((episodeInfo) => episodeInfo.mirrors
          .filter((mirror) => mirror.magnetLink && mirror.magnetLink.length)
          .map((mirror) => ({
@@ -169,15 +169,23 @@ async function _parseShowData(showData) {
 async function verifyFiles(torrent, files) {
  if (files && files.length) {
    const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
-        .then((existing) => existing.reduce((map, file) => (map[file.fileIndex] = file, map), {}))
+        .then((existing) => existing
+            .reduce((map, next) => {
+              const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+              map[fileIndex] = (map[fileIndex] || []).concat(next);
+              return map;
+            }, {}))
        .catch(() => undefined);
    if (existingFiles && Object.keys(existingFiles).length) {
      return files
-          .map(file => ({
-            ...file,
-            id: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].id,
-            size: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].size || file.size
-          }))
+          .map(file => {
+            const mapping = existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
+            // shift() yields undefined once a bucket is drained; such files are returned unchanged
+            const originalFile = mapping && mapping.shift();
+            return originalFile
+                ? { ...file, id: originalFile.id, size: originalFile.size || file.size }
+                : file;
+          })
    }
    return files;
  }
--- a/scraper/scrapers/kickass/kickass_scraper.js
+++ b/scraper/scrapers/kickass/kickass_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -39,14 +39,14 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return kickass.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }

 async function processTorrentRecord(record) {
--- a/scraper/scrapers/rarbg/rarbg_dump_scraper.js
+++ b/scraper/scrapers/rarbg/rarbg_dump_scraper.js
@@ -16,12 +16,13 @@ const entryLimiter = new Bottleneck({ maxConcurrent: 40 });

 async function scrape() {
  console.log(`[${moment()}] starting ${NAME} dump scrape...`);
-  const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
-  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json');
-  const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
+  //const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
+  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json').slice(800);
+  //const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);

-  return Promise.all(allImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId)
-      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))))))))
+  return Promise.all(
+      seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
+          .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
      .then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
 }

--- a/scraper/scrapers/rarbg/rarbg_scraper.js
+++ b/scraper/scrapers/rarbg/rarbg_scraper.js
@@ -20,7 +20,7 @@ async function scrape() {
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -64,8 +64,8 @@ async function scrapeLatestTorrentsForCategory(category) {
      })))
      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
+        console.warn(`Failed ${NAME} scrapping for ${category} due: `, error);
+        return Promise.resolve([]);
      });
 }

--- a/scraper/scrapers/thepiratebay/thepiratebay_api.js
+++ b/scraper/scrapers/thepiratebay/thepiratebay_api.js
@@ -6,9 +6,9 @@ const Promises = require('../../lib/promises');

 const defaultProxies = [
  'https://thepiratebay.org',
-  'https://piratebays.icu',
-  'https://piratebays.cool',
-  'https://piratebays.life'];
+  'https://proxybay.pro',
+  'https://ukpiratebayproxy.com',
+  'https://thepiratebayproxy.info'];
 const dumpUrl = '/static/dump/csv/';
 const defaultTimeout = 10000;

@@ -169,15 +169,18 @@ function parseBody(body) {
    $('table[id=\'searchResult\'] tr').each(function () {
      const name = $(this).find('.detLink').text();
      const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
+      const magnetLink = $(this).find('a[title=\'Download this torrent using magnet\']').attr('href');
      if (!name || !sizeMatcher) {
        return;
      }
      torrents.push({
-        torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
        name: name,
+        magnetLink: magnetLink,
+        infoHash: decode(magnetLink).infoHash,
+        torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
        seeders: parseInt($(this).find('td[align=\'right\']').eq(0).text(), 10),
        leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10),
-        magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
+
        category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0],
            10),
        subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0],
--- a/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
+++ b/scraper/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -32,7 +32,7 @@ async function scrape() {
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -45,14 +45,14 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return thepiratebay.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }

 async function processTorrentRecord(record) {