[scraper] updates scrapers and unique index

TheBeastLT
2020-03-14 22:25:51 +01:00
parent 1efaa0451c
commit 326a07b82e
13 changed files with 110 additions and 57 deletions

View File

@@ -14,7 +14,7 @@ const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepi
 const PROVIDERS = [
   // horribleSubsScraper,
-  rarbgScraper,
+  // rarbgScraper,
   thepiratebayScraper,
   kickassScraper,
   leetxScraper
@@ -42,7 +42,7 @@ function enableScheduling() {
 }
 server.get('/', function (req, res) {
-  res.send(200);
+  res.sendStatus(200);
 });
 server.listen(process.env.PORT || 7000, async () => {
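
Note on the status handler change: Express 4 deprecated passing a bare number to res.send (ambiguous between a status code and a body); res.sendStatus(200) sets the status and sends its standard reason phrase. A minimal sketch of the difference:

// res.send(200) logs a deprecation warning in Express 4+;
// res.sendStatus(200) sets HTTP 200 and sends the body "OK".
const express = require('express');
const app = express();
app.get('/', (req, res) => res.sendStatus(200));
app.listen(7000);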

View File

@@ -96,10 +96,11 @@ async function getImdbId(info, type) {
   });
 }).catch(() => bing.web(`${name} ${year || ''} ${type} imdb`)
     .then(results => results
-        .map((result) => result.link)
+        .map(result => result.link)
         .find(result => result.includes('imdb.com/title/')))
     .then(result => result && result.match(/imdb\.com\/title\/(tt\d+)/))
-    .then(match => match && match[1])));
+    .then(match => match && match[1])))
+    .then(imdbId => 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0'));
 }
 async function getKitsuId(info) {
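
The appended .then normalizes whichever ID the lookup chain produced into the canonical zero-padded form, so 'tt123456' and 'tt0123456' no longer count as distinct IDs downstream. The same expression in isolation (outputs in comments are worked examples):

// Strip the 'tt' prefix and any leading zeros, then pad back to 7 digits.
function canonicalImdbId(imdbId) {
  return 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0');
}
canonicalImdbId('tt123456');   // 'tt0123456'
canonicalImdbId('tt0123456');  // 'tt0123456'
canonicalImdbId('tt12345678'); // 'tt12345678' (8-digit ids are left as-is)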

View File

@@ -1,9 +1,14 @@
-const { Sequelize } = require('sequelize');
+const { Sequelize, fn, col } = require('sequelize');
 const Op = Sequelize.Op;
 const DATABASE_URI = process.env.DATABASE_URI;
-const database = new Sequelize(DATABASE_URI, { logging: false });
+const database = new Sequelize(
+    DATABASE_URI,
+    {
+      logging: false
+    }
+);
 const Provider = database.define('provider', {
   name: { type: Sequelize.STRING(32), primaryKey: true },
@@ -47,6 +52,19 @@ const File = database.define('file',
     },
     {
       indexes: [
+        {
+          unique: true,
+          name: 'files_unique_file_constraint',
+          fields: [
+            col('infoHash'),
+            fn('COALESCE', (col('fileIndex')), -1),
+            fn('COALESCE', (col('imdbId')), 'null'),
+            fn('COALESCE', (col('imdbSeason')), -1),
+            fn('COALESCE', (col('imdbEpisode')), -1),
+            fn('COALESCE', (col('kitsuId')), -1),
+            fn('COALESCE', (col('kitsuEpisode')), -1)
+          ]
+        },
         { unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] },
         { unique: false, fields: ['kitsuId', 'kitsuEpisode'] }
       ]
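
Why the COALESCE wrappers: in standard SQL, NULLs never compare equal, so a plain unique index over these nullable columns would admit duplicate rows whenever one of the fields is NULL. Wrapping each nullable column in fn('COALESCE', ...) makes Sequelize create an expression index that folds NULL to a sentinel matching the column type (-1 for numeric columns, the string 'null' for imdbId), so otherwise-identical rows now collide. A sketch of roughly what this should compile to on Postgres (identifier quoting and table name reconstructed, not taken from the commit):

// CREATE UNIQUE INDEX "files_unique_file_constraint" ON "files" (
//   "infoHash",
//   COALESCE("fileIndex", -1),
//   COALESCE("imdbId", 'null'),
//   COALESCE("imdbSeason", -1),
//   COALESCE("imdbEpisode", -1),
//   COALESCE("kitsuId", -1),
//   COALESCE("kitsuEpisode", -1)
// );
// With the index in place, a second insert of the same logical file is
// rejected even when the episode fields are all NULL (fields illustrative):
File.create({ infoHash: 'abc', fileIndex: null, imdbId: null }); // ok
File.create({ infoHash: 'abc', fileIndex: null, imdbId: null }); // unique violation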
@@ -63,7 +81,11 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', {
 });
 function connect() {
-  return database.sync({ alter: true });
+  return database.sync({ alter: true })
+      .catch(error => {
+        console.error('Failed syncing database: ', error);
+        throw error;
+      });
 }
 function getProvider(provider) {

View File

@@ -111,7 +111,7 @@ async function filesFromTorrentStream(torrent) {
 function filterVideos(files) {
   return files.filter((file) => {
     const match = file.path.match(/\.(\w{2,4})$/);
-    return match && EXTENSIONS.includes(match[1]);
+    return match && EXTENSIONS.includes(match[1].toLowerCase());
   });
 }
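
The lowercasing makes the extension check case-insensitive; assuming EXTENSIONS holds lowercase entries, a file named 'Movie.MKV' was previously dropped:

const EXTENSIONS = ['mkv', 'mp4', 'avi'];         // illustrative subset
const ext = 'Movie.MKV'.match(/\.(\w{2,4})$/)[1]; // 'MKV'
EXTENSIONS.includes(ext);               // false -> file used to be filtered out
EXTENSIONS.includes(ext.toLowerCase()); // true  -> file is now kept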

View File

@@ -62,8 +62,9 @@ async function parseTorrentFiles(torrent) {
 }
 async function getSeriesFiles(torrent, parsedTorrentName) {
-  if ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
-      (!parsedTorrentName.episodes && parsedTorrentName.date)) {
+  if (!parsedTorrentName.complete && !parsedTorrentName.hasMovies &&
+      ((parsedTorrentName.episode && (!parsedTorrentName.seasons || parsedTorrentName.seasons.length <= 1)) ||
+      (!parsedTorrentName.episodes && parsedTorrentName.date))) {
     return [{
       name: torrent.title,
       path: torrent.title,

View File

@@ -2,6 +2,7 @@ require('dotenv').config();
 const Bottleneck = require('bottleneck');
 const { parse } = require('parse-torrent-title');
 const repository = require('../lib/repository');
+const { getImdbId } = require('../lib/metadata');
 const { parseTorrentFiles } = require('../lib/torrentFiles');
 const { Type } = require('../lib/types');
@@ -15,7 +16,7 @@ async function addMissingEpisodes() {
   const imdbId = Object.values(storedFiles)[0].imdbId;
   torrentFiles
-      .filter((file) => !storedFiles[file.fileIndex])
+      .filter((file) => !storedFiles[file.fileIndex !== undefined ? file.fileIndex : null])
       .map((file) => ({
         infoHash: torrent.infoHash,
         fileIndex: file.fileIndex,
@@ -63,11 +64,22 @@ async function reapplySeriesSeasonsSavedAsMovies() {
       .then(() => console.log('Finished updating multiple torrents'));
 }
+async function reapplyDecomposingToTorrentsOnRegex(regex) {
+  return repository.getTorrentsBasedOnTitle(regex, Type.ANIME)
+      .then(torrents => Promise.all(torrents
+          .map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, true)))))
+      .then(() => console.log('Finished updating multiple torrents'));
+}
 async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
   const torrent = await repository.getTorrent({ infoHash });
   const storedFiles = await repository.getFiles({ infoHash });
   const fileIndexMap = storedFiles
-      .reduce((map, next) => (map[next.fileIndex] = (map[next.fileIndex] || []).concat(next), map), {});
+      .reduce((map, next) => {
+        const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+        map[fileIndex] = (map[fileIndex] || []).concat(next);
+        return map;
+      }, {});
   const files = includeSourceFiles && Object.values(fileIndexMap)
       .map(sameIndexFiles => sameIndexFiles[0])
       .map(file => ({
@@ -76,12 +88,14 @@ async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
         path: file.title,
         size: file.size
       }));
-  const imdbId = storedFiles[0].imdbId;
+  const imdbId = storedFiles.length && storedFiles[0].imdbId || await getImdbId(parse(torrent.title));
   return parseTorrentFiles({ ...torrent, imdbId, files })
       .then(newFiles => newFiles.map(file => {
-        if (fileIndexMap[file.fileIndex]) {
-          const originalFile = fileIndexMap[file.fileIndex].shift();
+        const fileIndex = file.fileIndex !== undefined ? file.fileIndex : null;
+        const mapping = fileIndexMap[fileIndex];
+        if (mapping) {
+          const originalFile = mapping.shift();
           if (originalFile) {
             if (!originalFile.imdbId) {
               originalFile.imdbId = file.imdbId
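
Context for the repeated fileIndex !== undefined ? fileIndex : null normalization: rows read back from the database carry fileIndex === null (single-file torrents), while freshly parsed files may omit the property entirely, so their fileIndex is undefined. Object keys are stringified, so null and undefined address different buckets and the old lookup silently missed. A minimal illustration:

const stored = {};
stored[null] = { id: 1 };  // stored under the string key 'null'
const file = {};           // parsed file without a fileIndex property
stored[file.fileIndex];    // looks up key 'undefined' -> misses
const key = file.fileIndex !== undefined ? file.fileIndex : null;
stored[key];               // looks up key 'null' -> { id: 1 }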
@@ -176,5 +190,7 @@ async function findAllFiles() {
 //addMissingEpisodes().then(() => console.log('Finished'));
 //findAllFiles().then(() => console.log('Finished'));
 //updateMovieCollections().then(() => console.log('Finished'));
-reapplyEpisodeDecomposing('d71c4fd1cb9bb9c5365a570b903a3a58774f61a5', true).then(() => console.log('Finished'));
+reapplyEpisodeDecomposing('aec7bcac457ad68924e7119f859cf6fa3878f9f5', false).then(() => console.log('Finished'));
 //reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
+// reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished'));
+//reapplyManualHashes().then(() => console.log('Finished'));

View File

@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
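
On the repository.updateProvider -> lastScrape.save() change, repeated across the scrapers below: lastScrape is presumably the Sequelize model instance fetched earlier in scrape(), so save() persists the updated lastScraped field directly instead of going through the repository helper. A sketch under that assumption (the actual lookup is outside this hunk):

const lastScrape = await Provider.findByPk(NAME); // assumption: how lastScrape is obtained
lastScrape.lastScraped = scrapeStart;
await lastScrape.save(); // one UPDATE on the provider row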
@@ -40,14 +40,15 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return leetx.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }
 async function processTorrentRecord(record) {
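
The reordering here (mirrored in the kickass and thepiratebay scrapers below) scopes the catch to the browse call alone: a failed page fetch now resolves to an empty list, the resolved.length > 0 check ends the pagination recursion cleanly, and errors thrown by processTorrentRecord are no longer swallowed by a trailing catch. The pattern in isolation (fetchPage and process are stand-ins):

function scrapePages(page = 1, untilPage = 5) {
  return fetchPage(page)
      .catch(() => [])  // recover only the fetch; an empty page stops recursion
      .then(torrents => Promise.all(torrents.map(process))) // errors here propagate
      .then(resolved => resolved.length > 0 && page < untilPage
          ? scrapePages(page + 1, untilPage)
          : Promise.resolve());
}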

View File

@@ -11,7 +11,7 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
 const showMappings = require('./horriblesubs_mapping.json');
 const NAME = 'HorribleSubs';
-const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
+const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;
 const limiter = new Bottleneck({ maxConcurrent: 5 });
 const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
@@ -26,7 +26,7 @@ async function scrape() {
     return _scrapeAllShows()
         .then(() => {
           lastScrape.lastScraped = scrapeStart;
-          return repository.updateProvider(lastScrape);
+          return lastScrape.save();
         })
         .then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
   } else {
@@ -140,7 +140,7 @@ async function _parseShowData(showData) {
     return kitsuId;
   };
-  return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
+  return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
      .map((episodeInfo) => episodeInfo.mirrors
          .filter((mirror) => mirror.magnetLink && mirror.magnetLink.length)
          .map((mirror) => ({
@@ -169,15 +169,23 @@ async function _parseShowData(showData) {
 async function verifyFiles(torrent, files) {
   if (files && files.length) {
     const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
-        .then((existing) => existing.reduce((map, file) => (map[file.fileIndex] = file, map), {}))
+        .then((existing) => existing
+            .reduce((map, next) => {
+              const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
+              map[fileIndex] = (map[fileIndex] || []).concat(next);
+              return map;
+            }, {}))
         .catch(() => undefined);
     if (existingFiles && Object.keys(existingFiles).length) {
       return files
-          .map(file => ({
-            ...file,
-            id: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].id,
-            size: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].size || file.size
-          }))
+          .map(file => {
+            const mapping = existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
+            if (mapping) {
+              const originalFile = mapping.shift();
+              return { ...file, id: originalFile.id, size: originalFile.size || file.size };
+            }
+            return file;
+          })
     }
     return files;
   }
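
Switching the map values from single rows to arrays consumed with shift() matters when several stored rows share a fileIndex: each incoming file now claims a distinct original row (keeping its database id and stored size) instead of every duplicate resolving to the same row:

const mapping = [{ id: 1, size: 100 }, { id: 2, size: 200 }]; // same fileIndex, two rows
mapping.shift(); // first file pairs with id 1
mapping.shift(); // second file pairs with id 2, not id 1 again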

View File

@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -39,14 +39,14 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return kickass.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }
 async function processTorrentRecord(record) {

View File

@@ -16,12 +16,13 @@ const entryLimiter = new Bottleneck({ maxConcurrent: 40 });
 async function scrape() {
   console.log(`[${moment()}] starting ${NAME} dump scrape...`);
-  const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
-  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json');
-  const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
-  return Promise.all(allImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId)
-      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))))))))
+  //const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
+  const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json').slice(800);
+  //const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
+  return Promise.all(
+      seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
+          .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
       .then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
 }

View File

@@ -20,7 +20,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -64,8 +64,8 @@ async function scrapeLatestTorrentsForCategory(category) {
       })))
       .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
       .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
+        console.warn(`Failed ${NAME} scrapping for ${category} due: `, error);
+        return Promise.resolve([]);
       });
 }

View File

@@ -6,9 +6,9 @@ const Promises = require('../../lib/promises');
 const defaultProxies = [
   'https://thepiratebay.org',
-  'https://piratebays.icu',
-  'https://piratebays.cool',
-  'https://piratebays.life'];
+  'https://proxybay.pro',
+  'https://ukpiratebayproxy.com',
+  'https://thepiratebayproxy.info'];
 const dumpUrl = '/static/dump/csv/';
 const defaultTimeout = 10000;
@@ -169,15 +169,18 @@ function parseBody(body) {
   $('table[id=\'searchResult\'] tr').each(function () {
     const name = $(this).find('.detLink').text();
     const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
+    const magnetLink = $(this).find('a[title=\'Download this torrent using magnet\']').attr('href');
     if (!name || !sizeMatcher) {
       return;
     }
     torrents.push({
+      torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
       name: name,
-      torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
+      magnetLink: magnetLink,
+      infoHash: decode(magnetLink).infoHash,
       seeders: parseInt($(this).find('td[align=\'right\']').eq(0).text(), 10),
       leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10),
-      magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
       category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0],
           10),
       subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0],
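
Capturing the magnet link once and decoding the infoHash from it lets the scraper identify torrents straight from the listing page. The decode import is outside this hunk; assuming it is a magnet-URI parser in the style of the magnet-uri package, the extraction looks like:

const { decode } = require('magnet-uri'); // assumption: the file's actual import may differ
const magnetLink = 'magnet:?xt=urn:btih:aec7bcac457ad68924e7119f859cf6fa3878f9f5&dn=example';
decode(magnetLink).infoHash; // 'aec7bcac457ad68924e7119f859cf6fa3878f9f5'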

View File

@@ -32,7 +32,7 @@ async function scrape() {
   return scrapeLatestTorrents()
       .then(() => {
         lastScrape.lastScraped = scrapeStart;
-        return repository.updateProvider(lastScrape);
+        return lastScrape.save();
       })
       .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
 }
@@ -45,14 +45,14 @@ async function scrapeLatestTorrents() {
 async function scrapeLatestTorrentsForCategory(category, page = 1) {
   console.log(`Scrapping ${NAME} ${category} category page ${page}`);
   return thepiratebay.browse(({ category, page }))
+      .catch(error => {
+        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
       .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
       .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
           ? scrapeLatestTorrentsForCategory(category, page + 1)
-          : Promise.resolve())
-      .catch(error => {
-        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
-        return Promise.resolve();
-      });
+          : Promise.resolve());
 }
 async function processTorrentRecord(record) {