adds horriblesubs scraper

This commit is contained in:
TheBeastLT
2019-03-15 17:17:28 +01:00
parent 88bddcff54
commit 7925f18064
3 changed files with 18 additions and 20 deletions

View File

@@ -83,4 +83,4 @@ async function getImdbId(info) {
}); });
} }
module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId }; module.exports = { escapeTitle, getMetadata, movieMetadata, seriesMetadata, getImdbId };

View File

@@ -15,6 +15,7 @@ const Torrent = database.define('torrent', {
title: { type: Sequelize.STRING(128), allowNull: false }, title: { type: Sequelize.STRING(128), allowNull: false },
type: { type: Sequelize.STRING(16), allowNull: false }, type: { type: Sequelize.STRING(16), allowNull: false },
imdbId: { type: Sequelize.STRING(12) }, imdbId: { type: Sequelize.STRING(12) },
kitsuId: { type: Sequelize.INTEGER },
uploadDate: { type: Sequelize.DATE, allowNull: false }, uploadDate: { type: Sequelize.DATE, allowNull: false },
seeders: { type: Sequelize.SMALLINT }, seeders: { type: Sequelize.SMALLINT },
files: { type: Sequelize.JSONB } files: { type: Sequelize.JSONB }

View File

@@ -8,7 +8,7 @@ const { parse } = require('parse-torrent-title');
const pirata = require('./api/thepiratebay'); const pirata = require('./api/thepiratebay');
const { torrentFiles } = require('../lib/torrent'); const { torrentFiles } = require('../lib/torrent');
const repository = require('../lib/repository'); const repository = require('../lib/repository');
const { getImdbId } = require('../lib/metadata'); const { getImdbId, escapeTitle } = require('../lib/metadata');
const NAME = 'ThePirateBay'; const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
@@ -16,21 +16,6 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
const limiter = new Bottleneck({maxConcurrent: 40}); const limiter = new Bottleneck({maxConcurrent: 40});
async function scrape() { async function scrape() {
const title = 'Я'
+ '+(2014)_1280x720-raroch.mp4'
.replace(/^"|"$/g, '')
.normalize('NFKD') // normalize non-ASCII characters
.replace(/[\u0300-\u036F]/g, '')
.replace(/&\w{2,6};/g, ' ')
.replace(/\s+/g, ' ')
.replace(/[\W\s]+/, ' ');
const titleInfo = parse(title);
const imdbId = await getImdbId({
name: titleInfo.title.toLowerCase(),
year: titleInfo.year
});
const lastScraped = await repository.getProvider({ name: NAME }); const lastScraped = await repository.getProvider({ name: NAME });
const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
@@ -49,11 +34,17 @@ async function scrape() {
infoHash: Buffer.from(row[1], 'base64').toString('hex'), infoHash: Buffer.from(row[1], 'base64').toString('hex'),
title: row[2] title: row[2]
.replace(/^"|"$/g, '') .replace(/^"|"$/g, '')
.replace(/&amp;/g, '&')
.replace(/&\w{2,6};/g, ' ') .replace(/&\w{2,6};/g, ' ')
.replace(/\s+/g, ' '), .replace(/\s+/g, ' '),
size: parseInt(row[3], 10) size: parseInt(row[3], 10)
}; };
if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) {
// torrent was already scraped previously, skipping
return;
}
if (!limiter.empty()) { if (!limiter.empty()) {
lr.pause() lr.pause()
} }
@@ -68,7 +59,7 @@ async function scrape() {
}); });
lr.on('end', () => { lr.on('end', () => {
fs.unlink(CSV_FILE_PATH); fs.unlink(CSV_FILE_PATH);
updateProvider({ name: NAME, lastScraped: lastDump.updatedAt.toDate() }); updateProvider({ name: NAME, lastScraped: lastDump.updatedAt });
console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`); console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`);
}); });
} }
@@ -116,14 +107,20 @@ async function processTorrentRecord(record) {
console.log(`imdbId search: ${torrentFound.name}`); console.log(`imdbId search: ${torrentFound.name}`);
const titleInfo = parse(torrentFound.name); const titleInfo = parse(torrentFound.name);
const imdbId = await getImdbId({ const imdbId = await getImdbId({
name: titleInfo.title.toLowerCase(), name: escapeTitle(titleInfo.title).toLowerCase(),
year: titleInfo.year, year: titleInfo.year,
type: type type: type
}).catch(() => undefined); }).catch(() => undefined);
if (!imdbId) { if (!imdbId) {
console.log(`imdbId not found: ${torrentFound.name}`); console.log(`imdbId not found: ${torrentFound.name}`);
repository.createFailedImdbTorrent(record); repository.updateTorrent({
infoHash: record.infoHash,
provider: NAME,
title: torrentFound.name,
uploadDate: record.uploadDate,
seeders: torrentFound.seeders,
});
return; return;
} }