moves scraper to a subpackage

This commit is contained in:
TheBeastLT
2020-03-10 15:12:18 +01:00
parent dff66d8fb2
commit 58aba322c2
30 changed files with 50 additions and 28 deletions

View File

@@ -0,0 +1,136 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const defaultUrl = 'https://horriblesubs.info';
const defaultTimeout = 10000;
function allShows(config = {}) {
return _getContent('/shows', config)
.then(($) => $('div[class="ind-show"]')
.map((index, element) => $(element).children('a'))
.map((index, element) => ({
title: element.attr('title'),
url: `${config.proxyUrl || defaultUrl}${element.attr('href')}`
})).get());
}
async function showData(showInfo, config = {}) {
const showEndpoint = (showInfo.url || showInfo).match(/\/show.+/)[0];
const title = showInfo.title;
const showId = await _getShowId(showEndpoint);
const packEntries = await _getShowEntries(showId, title, 'batch', config);
const singleEntries = await _getShowEntries(showId, title, 'show', config);
return {
title: title,
url: showInfo.url || showInfo,
showId: showId,
singleEpisodes: singleEntries,
packEpisodes: packEntries
};
}
async function getLatestEntries(config = {}) {
return _getAllLatestEntries(config)
.then((entries) => Promise.all(entries.map((entry) => _findLatestEntry(entry, config))))
.then((entries) => entries.filter((entry) => entry))
}
function _getContent(endpoint, config = {},) {
const baseUrl = config.proxyUrl || defaultUrl;
const timeout = config.timeout || defaultTimeout;
const url = endpoint.startsWith('http')
? endpoint.replace(/https?:\/\/[^/]+/, baseUrl)
: `${baseUrl}${endpoint}`;
return needle('get', url, { open_timeout: timeout, follow: 2 })
.then((response) => response.body)
.then((body) => cheerio.load(body));
}
function _getShowId(showEndpoint) {
return _getContent(showEndpoint)
.then($ => $('div.entry-content').find('script').html().match(/var hs_showid = (\d+)/)[1]);
}
function _getShowEntries(animeId, animeTitle, type, config) {
return _getAllEntries(animeId, type, config)
.then((entries) => entries.filter((entry) => entry.title === animeTitle));
}
function _getAllEntries(animeId, type, config, page = 0, autoExtend = true) {
const entriesEndpoint = `/api.php?method=getshows&type=${type}&showid=${animeId}&nextid=${page}`;
return _getEntries(entriesEndpoint, config)
.then((entries) => !autoExtend || !entries.length ? entries :
_getAllEntries(animeId, type, config, page + 1)
.then((nextEntries) => entries.concat(nextEntries)));
}
function _getEntries(endpoint, config) {
return _getContent(endpoint, config)
.then(($) => $('div[class="rls-info-container"]')
.map((index, element) => ({
title: $(element).find('a[class="rls-label"]').contents()
.filter((i, el) => el.nodeType === 3).first().text().trim(),
episode: $(element).find('a[class="rls-label"]').find('strong').text(),
uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
mirrors: $(element).find('div[class="rls-links-container"]').children()
.map((indexLink, elementLink) => ({
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
})).get()
})).get());
}
function _getAllLatestEntries(config, page = 0) {
const pageParam = page === 0 ? '' : `&nextid=${page}`;
const entriesEndpoint = `/api.php?method=getlatest${pageParam}`;
return _getContent(entriesEndpoint, config)
.then(($) => $('li a')
.map((index, element) => ({
urlEndpoint: $(element).attr('href'),
episode: $(element).find('strong').text()
})).get())
.then((entries) => entries.length < 12
? entries
: _getAllLatestEntries(config, page + 1)
.then((nextEntries) => entries.concat(nextEntries)));
}
async function _findLatestEntry(entry, config) {
const showId = await _getShowId(entry.urlEndpoint);
let foundEntry;
let page = 0;
let reachedEnd = false;
while (!foundEntry && !reachedEnd) {
const allEntries = await _getAllEntries(showId, 'show', config, page, false);
foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
page = page + 1;
reachedEnd = allEntries.length === 0;
}
if (!foundEntry) {
return;
}
return {
title: foundEntry.title,
url: entry.urlEndpoint,
showId: showId,
singleEpisodes: [foundEntry]
};
}
function _parseDate(date) {
if (date.match(/today/i)) {
return moment().toDate();
} else if (date.match(/yesterday/i)) {
return moment().subtract(1, 'day').toDate();
}
return moment(date, 'MM/DD/YYYY').toDate();
}
module.exports = { allShows, showData, getLatestEntries, _getShowId };

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,189 @@
const fs = require('fs');
const moment = require('moment');
const Bottleneck = require('bottleneck');
const decode = require('magnet-uri');
const horriblesubs = require('./horriblesubs_api.js');
const repository = require('../../lib/repository');
const { Type } = require('../../lib/types');
const { updateCurrentSeeders } = require('../../lib/torrent');
const { parseTorrentFiles } = require('../../lib/torrentFiles');
const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');
const NAME = 'HorribleSubs';
const NEXT_FULL_SCRAPE_OFFSET = 3 * 24 * 60 * 60; // 3 days;
const limiter = new Bottleneck({ maxConcurrent: 5 });
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
const lastScraped = lastScrape.lastScraped && moment.unix(lastScrape.lastScraped);
if (!lastScraped || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart) {
console.log(`[${scrapeStart}] scrapping all ${NAME} shows...`);
return _scrapeAllShows()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return repository.updateProvider(lastScrape);
})
.then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
} else {
console.log(`[${scrapeStart}] scrapping latest ${NAME} entries...`);
return _scrapeLatestEntries()
.then(() => console.log(`[${moment()}] finished scrapping latest ${NAME} entries`));
}
}
async function _scrapeLatestEntries() {
const latestEntries = await horriblesubs.getLatestEntries();
return Promise.all(latestEntries
.map((entryData) => limiter.schedule(() => _parseShowData(entryData)
.catch((err) => console.log(err)))));
}
async function _scrapeAllShows() {
const shows = await horriblesubs.allShows();
return Promise.all(shows
.map((show) => limiter.schedule(() => horriblesubs.showData(show)
.then((showData) => _parseShowData(showData))
.catch((err) => console.log(err)))));
}
async function compareSearchKitsuIds() {
console.log(`${NAME}: initiating kitsu compare...`);
const shows = await horriblesubs.allShows()
.then((shows) => Promise.all(shows.slice(0, 1).map((show) => limiter.schedule(() => enrichShow(show)))));
const incorrect = shows.filter(
(show) => showMappings[show.title] && showMappings[show.title].kitsu_id !== show.kitsu_id);
const incorrectRatio = incorrect.length / shows.length;
console.log(incorrect);
console.log(`Ratio: ${incorrectRatio}`);
}
async function initMapping() {
console.log(`${NAME}: initiating kitsu mapping...`);
const shows = await horriblesubs.allShows()
.then((shows) => shows.filter((show) => !showMappings[show.title]))
.then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
.then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(shows), 'utf8', function (err) {
if (err) {
console.log("An error occurred while writing JSON Object to File.");
} else {
console.log(`${NAME}: finished kitsu mapping`);
}
});
}
async function enrichShow(show) {
console.log(`${NAME}: getting show info for ${show.title}...`);
const showId = await horriblesubs._getShowId(show.url)
.catch(() => show.title);
const metadata = await getKitsuId({ title: show.title })
.then((kitsuId) => getMetadata(kitsuId))
.catch((error) => {
console.log(`Failed getting kitsu meta: ${error.message}`);
return {};
});
return {
showId: showId,
kitsu_id: metadata.kitsuId,
...show,
kitsuTitle: metadata.title,
imdb_id: metadata.imdbId
}
}
async function _parseShowData(showData) {
console.log(`${NAME}: scrapping ${showData.title} data...`);
const showMapping = showMappings[showData.title];
const kitsuId = showMapping && showMapping.kitsu_id;
if (!showMapping) {
throw new Error(`No kitsu mapping found for ${showData.title}`);
}
if (!kitsuId) {
throw new Error(`No kitsuId found for ${showData.title}`);
}
// sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
.then((metas) => metas.reduce((map, meta) => {
const epOffset = Object.keys(map).length;
[...Array(meta.totalCount).keys()]
.map(ep => ep + 1)
.forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title });
return map;
}, {})) || {};
const formatTitle = (episodeInfo, mirror) => {
const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
if (mapping) {
return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
}
return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
};
const getKitsuId = inputEpisode => {
const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode;
const episode = parseInt(episodeString, 10);
return kitsuIdsMapping[episode] && kitsuIdsMapping[episode].kitsuId || kitsuId;
};
return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
.map((episodeInfo) => episodeInfo.mirrors
.map((mirror) => ({
provider: NAME,
...mirror,
infoHash: decode(mirror.magnetLink).infoHash,
trackers: decode(mirror.magnetLink).tr.join(','),
title: formatTitle(episodeInfo, mirror),
size: 300000000,
type: Type.ANIME,
kitsuId: getKitsuId(episodeInfo.episode),
uploadDate: episodeInfo.uploadDate,
})))
.reduce((a, b) => a.concat(b), [])
.map((incompleteTorrent) => entryLimiter.schedule(() => checkIfExists(incompleteTorrent)
.then((torrent) => torrent && updateCurrentSeeders(torrent))
.then((torrent) => torrent && parseTorrentFiles(torrent)
.then((files) => verifyFiles(torrent, files))
.then((files) => repository.createTorrent(torrent)
.then(() => files.forEach(file => repository.createFile(file)))
.then(() => console.log(`Created entry for ${torrent.title}`)))))))
.then(() => console.log(`${NAME}: finished scrapping ${showData.title} data`));
}
async function verifyFiles(torrent, files) {
if (files && files.length) {
const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
.then((existing) => existing.reduce((map, file) => (map[file.fileIndex] = file, map), {}))
.catch(() => undefined);
if (existingFiles && Object.keys(existingFiles).length) {
return files
.map(file => ({
...file,
id: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].id,
size: existingFiles[file.fileIndex] && existingFiles[file.fileIndex].size || file.size
}))
}
return files;
}
return Promise.reject(`No video files found for: ${torrent.title}`);
}
async function checkIfExists(torrent) {
const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
if (!existingTorrent) {
return torrent; // no torrent exists yet
} else if (existingTorrent.provider === NAME) {
return undefined; // torrent by this provider already exists
}
return { ...torrent, size: existingTorrent.size, seeders: existingTorrent.seeders };
}
module.exports = { scrape };