moves scraper to a subpackage

2020-03-10 15:12:18 +01:00
parent dff66d8fb2
commit 58aba322c2
30 changed files with 50 additions and 28 deletions
--- a/scraper/scrapers/kickass/kickass_dump_scraper.js
+++ b/scraper/scrapers/kickass/kickass_dump_scraper.js
@@ -0,0 +1,98 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const LineByLineReader = require('line-by-line');
+const fs = require('fs');
+const { Type } = require('../../lib/types');
+const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
+
+// Provider label written to the `provider` field of torrent entries built in processTorrentRecord.
+const NAME = 'KickassTorrents';
+// Path of the dump file that scrape() reads line by line and tries to unlink when reading ends.
+const CSV_FILE_PATH = '/tmp/kickass.csv';
+
+// Limiter through which scrape() schedules processTorrentRecord calls; configured with maxConcurrent 40.
+const limiter = new Bottleneck({ maxConcurrent: 40 });
+
+async function scrape() {
+  console.log(`starting to scrape KAT dump: ${JSON.stringify(lastDump)}`);
+
+  let entriesProcessed = 0;
+  const lr = new LineByLineReader(CSV_FILE_PATH);
+  lr.on('line', (line) => {
+    if (entriesProcessed % 1000 === 0) {
+      console.log(`Processed ${entriesProcessed} entries`);
+    }
+    const row = line.match(/(?<=^|\|)(".*"|[^|]+)(?=\||$)/g);
+    if (row.length !== 11) {
+      console.log(`Invalid row: ${line}`);
+      return;
+    }
+    const torrent = {
+      infoHash: row[0].toLowerCase(),
+      title: row[1]
+          .replace(/^"|"$/g, '')
+          .replace(/&amp;/g, '&')
+          .replace(/&\w{2,6};/g, ' ')
+          .replace(/\s+/g, ' ')
+          .trim(),
+      category: row[2],
+      size: parseInt(row[5], 10),
+      seeders: parseInt(row[8], 10),
+      uploadDate: moment.unix(parseInt(row[10], 10)).toDate(),
+    };
+
+    if (!limiter.empty()) {
+      lr.pause()
+    }
+
+    limiter.schedule(() => processTorrentRecord(torrent)
+        .catch((error) => console.log(`failed ${torrent.title} due: ${error}`)))
+        .then(() => limiter.empty())
+        .then((empty) => empty && lr.resume())
+        .then(() => entriesProcessed++);
+  });
+  lr.on('error', (err) => {
+    console.log(err);
+  });
+  lr.on('end', () => {
+    fs.unlink(CSV_FILE_PATH);
+    console.log(`finished to scrape KAT dump: ${JSON.stringify(lastDump)}!`);
+  });
+}
+
+const categoryMapping = {
+  "Movies": Type.MOVIE,
+  "TV": Type.SERIES,
+  "Anime": Type.ANIME
+};
+
+async function processTorrentRecord(record) {
+  if (!categoryMapping[record.category] || record.seeders === 0) {
+    return createSkipTorrentEntry(record);
+  }
+  if (await getStoredTorrentEntry(record)) {
+    return;
+  }
+
+  const torrentFound = await findTorrent(record).catch(() => undefined);
+
+  if (!torrentFound) {
+    return createSkipTorrentEntry(record);
+  }
+
+  const torrent = {
+    infoHash: record.infoHash,
+    provider: NAME,
+    title: torrentFound.name,
+    size: record.size,
+    type: categoryMapping[record.category],
+    imdbId: torrentFound.imdbId,
+    uploadDate: record.uploadDate,
+    seeders: torrentFound.seeders,
+  };
+
+  return createTorrentEntry(torrent);
+}
+
+async function findTorrent(record) {
+  return Promise.reject("not found");
+}
+
+module.exports = { scrape };