mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
[scraper] updates scrapers and unique index
This commit is contained in:
@@ -20,7 +20,7 @@ async function scrape() {
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return repository.updateProvider(lastScrape);
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
@@ -40,14 +40,15 @@ async function scrapeLatestTorrents() {
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return leetx.browse(({ category, page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve())
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve();
|
||||
});
|
||||
: Promise.resolve());
|
||||
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
|
||||
@@ -11,7 +11,7 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
|
||||
const showMappings = require('./horriblesubs_mapping.json');
|
||||
|
||||
const NAME = 'HorribleSubs';
|
||||
const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
|
||||
const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
@@ -26,7 +26,7 @@ async function scrape() {
|
||||
return _scrapeAllShows()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return repository.updateProvider(lastScrape);
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
|
||||
} else {
|
||||
@@ -140,7 +140,7 @@ async function _parseShowData(showData) {
|
||||
return kitsuId;
|
||||
};
|
||||
|
||||
return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
|
||||
return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
|
||||
.map((episodeInfo) => episodeInfo.mirrors
|
||||
.filter((mirror) => mirror.magnetLink && mirror.magnetLink.length)
|
||||
.map((mirror) => ({
|
||||
@@ -169,15 +169,23 @@ async function _parseShowData(showData) {
|
||||
async function verifyFiles(torrent, files) {
|
||||
if (files && files.length) {
|
||||
const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
|
||||
.then((existing) => existing.reduce((map, file) => (map[file.fileIndex] = file, map), {}))
|
||||
.then((existing) => existing
|
||||
.reduce((map, next) => {
|
||||
const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
|
||||
map[fileIndex] = (map[fileIndex] || []).concat(next);
|
||||
return map;
|
||||
}, {}))
|
||||
.catch(() => undefined);
|
||||
if (existingFiles && Object.keys(existingFiles).length) {
|
||||
return files
|
||||
.map(file => ({
|
||||
...file,
|
||||
id: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].id,
|
||||
size: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].size || file.size
|
||||
}))
|
||||
.map(file => {
|
||||
const mapping = existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
|
||||
if (mapping) {
|
||||
const originalFile = mapping.shift();
|
||||
return { ...file, id: originalFile.id, size: originalFile.size || file.size };
|
||||
}
|
||||
return file;
|
||||
})
|
||||
}
|
||||
return files;
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ async function scrape() {
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return repository.updateProvider(lastScrape);
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
@@ -39,14 +39,14 @@ async function scrapeLatestTorrents() {
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return kickass.browse(({ category, page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve())
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve();
|
||||
});
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
|
||||
@@ -16,12 +16,13 @@ const entryLimiter = new Bottleneck({ maxConcurrent: 40 });
|
||||
|
||||
async function scrape() {
|
||||
console.log(`[${moment()}] starting ${NAME} dump scrape...`);
|
||||
const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
|
||||
const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json');
|
||||
const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
|
||||
//const movieImdbIds = require('./rargb_movie_imdb_ids_2020-03-09.json');
|
||||
const seriesImdbIds = require('./rargb_series_imdb_ids_2020-03-09.json').slice(800);
|
||||
//const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
|
||||
|
||||
return Promise.all(allImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId)
|
||||
.then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))))))))
|
||||
return Promise.all(
|
||||
seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
|
||||
.then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ async function scrape() {
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return repository.updateProvider(lastScrape);
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
@@ -64,8 +64,8 @@ async function scrapeLatestTorrentsForCategory(category) {
|
||||
})))
|
||||
.then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve();
|
||||
console.warn(`Failed ${NAME} scrapping for ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@ const Promises = require('../../lib/promises');
|
||||
|
||||
const defaultProxies = [
|
||||
'https://thepiratebay.org',
|
||||
'https://piratebays.icu',
|
||||
'https://piratebays.cool',
|
||||
'https://piratebays.life'];
|
||||
'https://proxybay.pro',
|
||||
'https://ukpiratebayproxy.com',
|
||||
'https://thepiratebayproxy.info'];
|
||||
const dumpUrl = '/static/dump/csv/';
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
@@ -169,15 +169,18 @@ function parseBody(body) {
|
||||
$('table[id=\'searchResult\'] tr').each(function () {
|
||||
const name = $(this).find('.detLink').text();
|
||||
const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
|
||||
const magnetLink = $(this).find('a[title=\'Download this torrent using magnet\']').attr('href');
|
||||
if (!name || !sizeMatcher) {
|
||||
return;
|
||||
}
|
||||
torrents.push({
|
||||
torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
|
||||
name: name,
|
||||
magnetLink: magnetLink,
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1],
|
||||
seeders: parseInt($(this).find('td[align=\'right\']').eq(0).text(), 10),
|
||||
leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10),
|
||||
magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
|
||||
|
||||
category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0],
|
||||
10),
|
||||
subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0],
|
||||
|
||||
@@ -32,7 +32,7 @@ async function scrape() {
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return repository.updateProvider(lastScrape);
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
@@ -45,14 +45,14 @@ async function scrapeLatestTorrents() {
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return thepiratebay.browse(({ category, page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve())
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve();
|
||||
});
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
|
||||
Reference in New Issue
Block a user