adds kickass scrapper

This commit is contained in:
TheBeastLT
2020-03-08 20:06:32 +01:00
parent 853c21472a
commit 6cc0c5dc64
11 changed files with 404 additions and 56 deletions

View File

@@ -3,7 +3,7 @@ const needle = require('needle');
const moment = require('moment');
const defaultUrl = 'https://horriblesubs.info';
const defaultTimeout = 5000;
const defaultTimeout = 10000;
function allShows(config = {}) {
return _getContent('/shows', config)
@@ -112,7 +112,15 @@ async function _findLatestEntry(entry, config) {
reachedEnd = allEntries.length === 0;
}
return foundEntry;
if (!foundEntry) {
return;
}
return {
title: foundEntry.title,
url: entry.urlEndpoint,
showId: showId,
singleEpisodes: [foundEntry]
};
}
function _parseDate(date) {

View File

@@ -1,4 +1,5 @@
const fs = require('fs');
const moment = require('moment');
const Bottleneck = require('bottleneck');
const decode = require('magnet-uri');
const horriblesubs = require('./horriblesubs_api.js');
@@ -10,25 +11,43 @@ const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');
const NAME = 'HorribleSubs';
const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
const limiter = new Bottleneck({ maxConcurrent: 5 });
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
const lastScraped = lastScrape.lastScraped && moment.unix(lastScrape.lastScraped);
if (!lastScraped.lastScraped) {
console.log(`${NAME}: no previous scrapping exist`);
//await _scrapeAllShows()
if (!lastScraped || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart) {
console.log(`[${scrapeStart}] scrapping all ${NAME} shows...`);
return _scrapeAllShows()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return repository.updateProvider(lastScrape);
})
.then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
} else {
console.log(`[${scrapeStart}] scrapping latest ${NAME} entries...`);
return _scrapeLatestEntries()
.then(() => console.log(`[${moment()}] finished scrapping latest ${NAME} entries`));
}
}
async function _scrapeLatestEntries() {
const latestEntries = await horriblesubs.getLatestEntries();
return Promise.all(latestEntries
.map((entryData) => limiter.schedule(() => _parseShowData(entryData)
.catch((err) => console.log(err)))));
}
async function _scrapeAllShows() {
console.log(`${NAME}: getting all shows...`);
const shows = await horriblesubs.allShows();
return Promise.all(shows
.slice(0, 5)
.map((show) => limiter.schedule(() => horriblesubs.showData(show)
.then((showData) => _parseShowData(showData))
.catch((err) => console.log(err)))));
@@ -94,7 +113,7 @@ async function _parseShowData(showData) {
}
// sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
const kitsuIdsMapping = kitsuId.length && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
.then((metas) => metas.reduce((map, meta) => {
const epOffset = Object.keys(map).length;
[...Array(meta.totalCount).keys()]