From 7925f18064820e77fa11e3386ae6cf6b4e4fbb6f Mon Sep 17 00:00:00 2001
From: TheBeastLT <pauliox@beyond.lt>
Date: Fri, 15 Mar 2019 17:17:28 +0100
Subject: [PATCH] adds horriblesubs scraper

---
 lib/metadata.js            |  2 +-
 lib/repository.js          |  1 +
 scrapers/piratebay_dump.js | 35 ++++++++++++++++-------------------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/lib/metadata.js b/lib/metadata.js
index 939ddda..77721ab 100644
--- a/lib/metadata.js
+++ b/lib/metadata.js
@@ -83,4 +83,4 @@ async function getImdbId(info) {
   });
 }
 
-module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId };
+module.exports = { escapeTitle, getMetadata, movieMetadata, seriesMetadata, getImdbId };
diff --git a/lib/repository.js b/lib/repository.js
index 199582a..d10dd4c 100644
--- a/lib/repository.js
+++ b/lib/repository.js
@@ -15,6 +15,7 @@ const Torrent = database.define('torrent', {
   title: { type: Sequelize.STRING(128), allowNull: false },
   type: { type: Sequelize.STRING(16), allowNull: false },
   imdbId: { type: Sequelize.STRING(12) },
+  kitsuId: { type: Sequelize.INTEGER },
   uploadDate: { type: Sequelize.DATE, allowNull: false },
   seeders: { type: Sequelize.SMALLINT },
   files: { type: Sequelize.JSONB }
diff --git a/scrapers/piratebay_dump.js b/scrapers/piratebay_dump.js
index f8bda6c..03461c9 100644
--- a/scrapers/piratebay_dump.js
+++ b/scrapers/piratebay_dump.js
@@ -8,7 +8,7 @@ const { parse } = require('parse-torrent-title');
 const pirata = require('./api/thepiratebay');
 const { torrentFiles } = require('../lib/torrent');
 const repository = require('../lib/repository');
-const { getImdbId } = require('../lib/metadata');
+const { getImdbId, escapeTitle } = require('../lib/metadata');
 
 const NAME = 'ThePirateBay';
 const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
@@ -16,21 +16,6 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
 const limiter = new Bottleneck({maxConcurrent: 40});
 
 async function scrape() {
-  const title = 'Я'
-      + '+(2014)_1280x720-raroch.mp4'
-      .replace(/^"|"$/g, '')
-      .normalize('NFKD') // normalize non-ASCII characters
-      .replace(/[\u0300-\u036F]/g, '')
-      .replace(/&\w{2,6};/g, ' ')
-      .replace(/\s+/g, ' ')
-      .replace(/[\W\s]+/, ' ');
-  const titleInfo = parse(title);
-  const imdbId = await getImdbId({
-    name: titleInfo.title.toLowerCase(),
-    year: titleInfo.year
-  });
-
-
   const lastScraped = await repository.getProvider({ name: NAME });
   const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
 
@@ -49,11 +34,17 @@ async function scrape() {
         infoHash: Buffer.from(row[1], 'base64').toString('hex'),
         title: row[2]
             .replace(/^"|"$/g, '')
+            .replace(/&amp;/g, '&')
             .replace(/&\w{2,6};/g, ' ')
             .replace(/\s+/g, ' '),
         size: parseInt(row[3], 10)
       };
 
+      if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) {
+        // torrent was already scraped previously, skipping
+        return;
+      }
+
       if (!limiter.empty()) {
         lr.pause()
       }
@@ -68,7 +59,7 @@ async function scrape() {
     });
     lr.on('end', () => {
         fs.unlink(CSV_FILE_PATH);
-      updateProvider({ name: NAME, lastScraped: lastDump.updatedAt.toDate() });
+        updateProvider({ name: NAME, lastScraped: lastDump.updatedAt });
         console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`);
     });
   }
@@ -116,14 +107,20 @@ async function processTorrentRecord(record) {
   console.log(`imdbId search: ${torrentFound.name}`);
   const titleInfo = parse(torrentFound.name);
   const imdbId = await getImdbId({
-    name: titleInfo.title.toLowerCase(),
+    name: escapeTitle(titleInfo.title).toLowerCase(),
     year: titleInfo.year,
     type: type
   }).catch(() => undefined);
 
   if (!imdbId) {
     console.log(`imdbId not found: ${torrentFound.name}`);
-    repository.createFailedImdbTorrent(record);
+    repository.updateTorrent({
+      infoHash: record.infoHash,
+      provider: NAME,
+      title: torrentFound.name,
+      uploadDate: record.uploadDate,
+      seeders: torrentFound.seeders,
+    });
     return;
   }