From 7925f18064820e77fa11e3386ae6cf6b4e4fbb6f Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Fri, 15 Mar 2019 17:17:28 +0100 Subject: [PATCH] adds horriblesubs scrapper --- lib/metadata.js | 2 +- lib/repository.js | 1 + scrapers/piratebay_dump.js | 35 ++++++++++++++++------------------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/lib/metadata.js b/lib/metadata.js index 939ddda..77721ab 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -83,4 +83,4 @@ async function getImdbId(info) { }); } -module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId }; +module.exports = { escapeTitle, getMetadata, movieMetadata, seriesMetadata, getImdbId }; diff --git a/lib/repository.js b/lib/repository.js index 199582a..d10dd4c 100644 --- a/lib/repository.js +++ b/lib/repository.js @@ -15,6 +15,7 @@ const Torrent = database.define('torrent', { title: { type: Sequelize.STRING(128), allowNull: false }, type: { type: Sequelize.STRING(16), allowNull: false }, imdbId: { type: Sequelize.STRING(12) }, + kitsuId: { type: Sequelize.INTEGER }, uploadDate: { type: Sequelize.DATE, allowNull: false }, seeders: { type: Sequelize.SMALLINT }, files: { type: Sequelize.JSONB } diff --git a/scrapers/piratebay_dump.js b/scrapers/piratebay_dump.js index f8bda6c..03461c9 100644 --- a/scrapers/piratebay_dump.js +++ b/scrapers/piratebay_dump.js @@ -8,7 +8,7 @@ const { parse } = require('parse-torrent-title'); const pirata = require('./api/thepiratebay'); const { torrentFiles } = require('../lib/torrent'); const repository = require('../lib/repository'); -const { getImdbId } = require('../lib/metadata'); +const { getImdbId, escapeTitle } = require('../lib/metadata'); const NAME = 'ThePirateBay'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; @@ -16,21 +16,6 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; const limiter = new Bottleneck({maxConcurrent: 40}); async function scrape() { - const title = 'Я' - + '+(2014)_1280x720-raroch.mp4' - .replace(/^"|"$/g, '') - .normalize('NFKD') // normalize non-ASCII characters - .replace(/[\u0300-\u036F]/g, '') - .replace(/&\w{2,6};/g, ' ') - .replace(/\s+/g, ' ') - .replace(/[\W\s]+/, ' '); - const titleInfo = parse(title); - const imdbId = await getImdbId({ - name: titleInfo.title.toLowerCase(), - year: titleInfo.year - }); - - const lastScraped = await repository.getProvider({ name: NAME }); const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); @@ -49,11 +34,17 @@ async function scrape() { infoHash: Buffer.from(row[1], 'base64').toString('hex'), title: row[2] .replace(/^"|"$/g, '') + .replace(/&/g, '&') .replace(/&\w{2,6};/g, ' ') .replace(/\s+/g, ' '), size: parseInt(row[3], 10) }; + if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) { + // torrent was already scraped previously, skipping + return; + } + if (!limiter.empty()) { lr.pause() } @@ -68,7 +59,7 @@ async function scrape() { }); lr.on('end', () => { fs.unlink(CSV_FILE_PATH); - updateProvider({ name: NAME, lastScraped: lastDump.updatedAt.toDate() }); + updateProvider({ name: NAME, lastScraped: lastDump.updatedAt }); console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`); }); } @@ -116,14 +107,20 @@ async function processTorrentRecord(record) { console.log(`imdbId search: ${torrentFound.name}`); const titleInfo = parse(torrentFound.name); const imdbId = await getImdbId({ - name: titleInfo.title.toLowerCase(), + name: escapeTitle(titleInfo.title).toLowerCase(), year: titleInfo.year, type: type }).catch(() => undefined); if (!imdbId) { console.log(`imdbId not found: ${torrentFound.name}`); - repository.createFailedImdbTorrent(record); + repository.updateTorrent({ + infoHash: record.infoHash, + provider: NAME, + title: torrentFound.name, + uploadDate: record.uploadDate, + seeders: torrentFound.seeders, + }); return; }