mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
adds horriblesubs scrapper
This commit is contained in:
@@ -83,4 +83,4 @@ async function getImdbId(info) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId };
|
module.exports = { escapeTitle, getMetadata, movieMetadata, seriesMetadata, getImdbId };
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ const Torrent = database.define('torrent', {
|
|||||||
title: { type: Sequelize.STRING(128), allowNull: false },
|
title: { type: Sequelize.STRING(128), allowNull: false },
|
||||||
type: { type: Sequelize.STRING(16), allowNull: false },
|
type: { type: Sequelize.STRING(16), allowNull: false },
|
||||||
imdbId: { type: Sequelize.STRING(12) },
|
imdbId: { type: Sequelize.STRING(12) },
|
||||||
|
kitsuId: { type: Sequelize.INTEGER },
|
||||||
uploadDate: { type: Sequelize.DATE, allowNull: false },
|
uploadDate: { type: Sequelize.DATE, allowNull: false },
|
||||||
seeders: { type: Sequelize.SMALLINT },
|
seeders: { type: Sequelize.SMALLINT },
|
||||||
files: { type: Sequelize.JSONB }
|
files: { type: Sequelize.JSONB }
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ const { parse } = require('parse-torrent-title');
|
|||||||
const pirata = require('./api/thepiratebay');
|
const pirata = require('./api/thepiratebay');
|
||||||
const { torrentFiles } = require('../lib/torrent');
|
const { torrentFiles } = require('../lib/torrent');
|
||||||
const repository = require('../lib/repository');
|
const repository = require('../lib/repository');
|
||||||
const { getImdbId } = require('../lib/metadata');
|
const { getImdbId, escapeTitle } = require('../lib/metadata');
|
||||||
|
|
||||||
const NAME = 'ThePirateBay';
|
const NAME = 'ThePirateBay';
|
||||||
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
|
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
|
||||||
@@ -16,21 +16,6 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
|
|||||||
const limiter = new Bottleneck({maxConcurrent: 40});
|
const limiter = new Bottleneck({maxConcurrent: 40});
|
||||||
|
|
||||||
async function scrape() {
|
async function scrape() {
|
||||||
const title = 'Я'
|
|
||||||
+ '+(2014)_1280x720-raroch.mp4'
|
|
||||||
.replace(/^"|"$/g, '')
|
|
||||||
.normalize('NFKD') // normalize non-ASCII characters
|
|
||||||
.replace(/[\u0300-\u036F]/g, '')
|
|
||||||
.replace(/&\w{2,6};/g, ' ')
|
|
||||||
.replace(/\s+/g, ' ')
|
|
||||||
.replace(/[\W\s]+/, ' ');
|
|
||||||
const titleInfo = parse(title);
|
|
||||||
const imdbId = await getImdbId({
|
|
||||||
name: titleInfo.title.toLowerCase(),
|
|
||||||
year: titleInfo.year
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
const lastScraped = await repository.getProvider({ name: NAME });
|
const lastScraped = await repository.getProvider({ name: NAME });
|
||||||
const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
||||||
|
|
||||||
@@ -49,11 +34,17 @@ async function scrape() {
|
|||||||
infoHash: Buffer.from(row[1], 'base64').toString('hex'),
|
infoHash: Buffer.from(row[1], 'base64').toString('hex'),
|
||||||
title: row[2]
|
title: row[2]
|
||||||
.replace(/^"|"$/g, '')
|
.replace(/^"|"$/g, '')
|
||||||
|
.replace(/&/g, '&')
|
||||||
.replace(/&\w{2,6};/g, ' ')
|
.replace(/&\w{2,6};/g, ' ')
|
||||||
.replace(/\s+/g, ' '),
|
.replace(/\s+/g, ' '),
|
||||||
size: parseInt(row[3], 10)
|
size: parseInt(row[3], 10)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) {
|
||||||
|
// torrent was already scraped previously, skipping
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (!limiter.empty()) {
|
if (!limiter.empty()) {
|
||||||
lr.pause()
|
lr.pause()
|
||||||
}
|
}
|
||||||
@@ -68,7 +59,7 @@ async function scrape() {
|
|||||||
});
|
});
|
||||||
lr.on('end', () => {
|
lr.on('end', () => {
|
||||||
fs.unlink(CSV_FILE_PATH);
|
fs.unlink(CSV_FILE_PATH);
|
||||||
updateProvider({ name: NAME, lastScraped: lastDump.updatedAt.toDate() });
|
updateProvider({ name: NAME, lastScraped: lastDump.updatedAt });
|
||||||
console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`);
|
console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -116,14 +107,20 @@ async function processTorrentRecord(record) {
|
|||||||
console.log(`imdbId search: ${torrentFound.name}`);
|
console.log(`imdbId search: ${torrentFound.name}`);
|
||||||
const titleInfo = parse(torrentFound.name);
|
const titleInfo = parse(torrentFound.name);
|
||||||
const imdbId = await getImdbId({
|
const imdbId = await getImdbId({
|
||||||
name: titleInfo.title.toLowerCase(),
|
name: escapeTitle(titleInfo.title).toLowerCase(),
|
||||||
year: titleInfo.year,
|
year: titleInfo.year,
|
||||||
type: type
|
type: type
|
||||||
}).catch(() => undefined);
|
}).catch(() => undefined);
|
||||||
|
|
||||||
if (!imdbId) {
|
if (!imdbId) {
|
||||||
console.log(`imdbId not found: ${torrentFound.name}`);
|
console.log(`imdbId not found: ${torrentFound.name}`);
|
||||||
repository.createFailedImdbTorrent(record);
|
repository.updateTorrent({
|
||||||
|
infoHash: record.infoHash,
|
||||||
|
provider: NAME,
|
||||||
|
title: torrentFound.name,
|
||||||
|
uploadDate: record.uploadDate,
|
||||||
|
seeders: torrentFound.seeders,
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user