mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
updates the package structure and updates initial horriblesubs scraper WIP
This commit is contained in:
125
scrapers/horriblesubs/horriblesubs_api.js
Normal file
125
scrapers/horriblesubs/horriblesubs_api.js
Normal file
@@ -0,0 +1,125 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const decode = require('magnet-uri');
|
||||
|
||||
// Base url used for requests and for building show links when config.proxyUrl is not set.
const defaultUrl = 'https://horriblesubs.info';
|
||||
// Fallback for config.timeout; handed to needle as `open_timeout`
// (presumably milliseconds — confirm against the needle documentation).
const defaultTimeout = 5000;
|
||||
|
||||
/**
 * Loads the `/shows` page and lists the show link found in each element whose
 * class attribute is exactly `ind-show`.
 *
 * @param {Object} [config] - optional settings; `proxyUrl` replaces the default
 *   base url for the request and as the prefix of every returned show url.
 * @returns {Promise<Array<{title: string, url: string}>>} title and absolute url per show.
 */
function allShows(config = {}) {
  return _getContent('/shows', config).then(($) => {
    const describeShow = (position, node) => {
      // The anchor directly under the show container carries both title and href.
      const anchor = $(node).children('a');
      return {
        title: anchor.attr('title'),
        url: `${config.proxyUrl || defaultUrl}${anchor.attr('href')}`
      };
    };

    return $('div[class="ind-show"]').map(describeShow).get();
  });
}
|
||||
|
||||
/**
 * Collects everything known about one show: its id plus all single-episode and
 * batch ("pack") releases.
 *
 * @param {string|{url: string, title?: string}} showInfo - a show url, or an
 *   object carrying the show `url` and optionally its `title`.
 * @param {Object} [config] - optional settings (`proxyUrl`, `timeout`) forwarded
 *   to every request made here, including the show-id lookup.
 * @returns {Promise<{title: string, url: string, showId: string,
 *   singleEpisodes: Array, packEpisodes: Array}>}
 * @throws {Error} when no `/show...` endpoint can be extracted from the given url.
 */
async function showData(showInfo, config = {}) {
  const showUrl = (showInfo && showInfo.url) || showInfo;
  const endpointMatch = typeof showUrl === 'string' ? showUrl.match(/\/show.+/) : null;
  if (!endpointMatch) {
    // Fail with a descriptive message instead of a TypeError on `null[0]`.
    throw new Error(`Cannot resolve show endpoint from: ${showUrl}`);
  }

  // Forward config so a configured proxy/timeout also applies to the show page fetch.
  const showId = await _getShowId(endpointMatch[0], config);
  const packEntries = await _getAllEntries(showId, 'batch', config);
  const singleEntries = await _getAllEntries(showId, 'show', config);
  // Fall back to the title of the first single release when the caller gave none.
  const title = (showInfo && showInfo.title) || (singleEntries[0] && singleEntries[0].title);

  return {
    title: title,
    url: showUrl,
    showId: showId,
    singleEpisodes: singleEntries,
    packEpisodes: packEntries
  };
}
|
||||
|
||||
/**
 * Resolves the full release entry for every item of the "latest" listing.
 *
 * @param {Object} [config] - optional request settings passed to the helpers.
 * @returns {Promise<Array>} the entries that could be found; listing items
 *   without a matching entry are dropped.
 */
async function getLatestEntries(config = {}) {
  const latestItems = await _getAllLatestEntries(config);
  const resolved = await Promise.all(latestItems.map((item) => _findLatestEntry(item, config)));
  // Lookups that found nothing yield a falsy value; keep only real entries.
  return resolved.filter(Boolean);
}
|
||||
|
||||
/**
 * Performs a GET request and returns the response body loaded into cheerio.
 *
 * @param {string} endpoint - a path appended to the base url, or an absolute
 *   http(s) url whose scheme and host are swapped for the base url.
 * @param {Object} [config] - optional settings: `proxyUrl` overrides the default
 *   base url, `timeout` overrides the default `open_timeout`.
 * @returns {Promise<Function>} the cheerio instance for the fetched document.
 */
function _getContent(endpoint, config = {}) {
  const baseUrl = config.proxyUrl || defaultUrl;
  const requestOptions = { open_timeout: config.timeout || defaultTimeout, follow: 2 };

  let url;
  if (endpoint.startsWith('http')) {
    // Absolute url: keep the path but route it through the configured base url.
    url = endpoint.replace(/https?:\/\/[^/]+/, baseUrl);
  } else {
    url = `${baseUrl}${endpoint}`;
  }

  return needle('get', url, requestOptions)
      .then(({ body }) => cheerio.load(body));
}
|
||||
|
||||
/**
 * Reads the numeric show id from a show page by looking for
 * `var hs_showid = <digits>` in the first script inside `div.entry-content`.
 *
 * @param {string} showEndpoint - show page path or absolute url.
 * @param {Object} [config] - optional request settings (`proxyUrl`, `timeout`);
 *   defaults to no overrides so existing single-argument callers keep working.
 * @returns {Promise<string>} the show id as a string of digits.
 * @throws {Error} (as a rejection) when the page has no such script or id.
 */
function _getShowId(showEndpoint, config = {}) {
  return _getContent(showEndpoint, config).then(($) => {
    const scriptSource = $('div.entry-content').find('script').html();
    const idMatch = scriptSource && scriptSource.match(/var hs_showid = (\d+)/);
    if (!idMatch) {
      // Without this guard a missing script/id surfaced as an opaque TypeError.
      throw new Error(`Could not find hs_showid on show page: ${showEndpoint}`);
    }
    return idMatch[1];
  });
}
|
||||
|
||||
/**
 * Fetches release entries of a show, optionally following pagination.
 *
 * @param {string|number} animeId - show id placed in the `showid` query parameter.
 * @param {string} type - value of the `type` query parameter (callers in this
 *   file use 'show' and 'batch').
 * @param {Object} config - request settings passed through to the fetch.
 * @param {number} [page=0] - value of the `nextid` query parameter.
 * @param {boolean} [autoExtend=true] - when true, keeps requesting following
 *   pages and concatenates their entries; when false, returns only this page.
 * @returns {Promise<Array>} the collected entries.
 */
function _getAllEntries(animeId, type, config, page = 0, autoExtend = true) {
  const entriesEndpoint = `/api.php?method=getshows&type=${type}&showid=${animeId}&nextid=${page}`;

  return _getEntries(entriesEndpoint, config).then((pageEntries) => {
    // Fewer than 12 entries is treated as the last page
    // (12 is presumably the API page size — confirm).
    const isLastPage = pageEntries.length < 12;
    if (!autoExtend || isLastPage) {
      return pageEntries;
    }
    return _getAllEntries(animeId, type, config, page + 1)
        .then((followingEntries) => pageEntries.concat(followingEntries));
  });
}
|
||||
|
||||
/**
 * Fetches one page of releases and converts each `rls-info-container` element
 * into an entry object.
 *
 * @param {string} endpoint - endpoint to fetch.
 * @param {Object} config - request settings passed through to the fetch.
 * @returns {Promise<Array<{title: string, episode: string, uploadDate: Date,
 *   mirrors: Array<{resolution: string, infoHash: string, magnetLink: string,
 *   torrentLink: string}>}>>}
 */
function _getEntries(endpoint, config) {
  return _getContent(endpoint, config).then(($) => {
    // One mirror per child of the links container; its id ends with the resolution (e.g. "720p").
    const toMirror = (linkPosition, linkNode) => {
      const link = $(linkNode);
      const resolution = link.attr('id').match(/\d+p$/)[0];
      const magnetLink = link.find('a[title="Magnet Link"]').attr('href');
      return {
        resolution: resolution,
        infoHash: decode(magnetLink).infoHash,
        magnetLink: magnetLink,
        torrentLink: link.find('a[title="Torrent Link"]').attr('href')
      };
    };

    const toEntry = (entryPosition, entryNode) => {
      const container = $(entryNode);
      const label = container.find('a[class="rls-label"]');
      return {
        // The title is the label's first direct text node (nodeType 3), not its child elements.
        title: label.contents()
            .filter((position, node) => node.nodeType === 3)
            .first()
            .text()
            .trim(),
        episode: label.find('strong').text(),
        uploadDate: _parseDate(label.find('span[class="rls-date"]').text()),
        mirrors: container.find('div[class="rls-links-container"]').children()
            .map(toMirror)
            .get()
      };
    };

    return $('div[class="rls-info-container"]').map(toEntry).get();
  });
}
|
||||
|
||||
/**
 * Collects every item of the "latest releases" listing, following pagination.
 *
 * @param {Object} config - request settings passed through to the fetch.
 * @param {number} [page=0] - page to start from; page 0 is requested without a
 *   `nextid` parameter.
 * @returns {Promise<Array<{urlEndpoint: string, episode: string}>>} link target
 *   and episode label of every listed item.
 */
function _getAllLatestEntries(config, page = 0) {
  let entriesEndpoint = '/api.php?method=getlatest';
  if (page !== 0) {
    entriesEndpoint += `&nextid=${page}`;
  }

  return _getContent(entriesEndpoint, config).then(($) => {
    const latest = $('li a')
        .map((position, node) => {
          const anchor = $(node);
          return {
            urlEndpoint: anchor.attr('href'),
            episode: anchor.find('strong').text()
          };
        })
        .get();

    // A page with fewer than 12 items is treated as the final one.
    if (latest.length < 12) {
      return latest;
    }
    return _getAllLatestEntries(config, page + 1)
        .then((olderItems) => latest.concat(olderItems));
  });
}
|
||||
|
||||
/**
 * Finds the full release entry matching an item of the "latest" listing by
 * paging through the show's single-episode releases.
 *
 * @param {{urlEndpoint: string, episode: string}} entry - listing item to resolve.
 * @param {Object} config - request settings; forwarded to the show-id lookup as
 *   well as to the release page requests.
 * @returns {Promise<Object|undefined>} the first entry whose `episode` equals
 *   `entry.episode`, or undefined once an empty page is reached without a match.
 */
async function _findLatestEntry(entry, config) {
  // Forward config so a configured proxy/timeout also applies to the show page fetch.
  const showId = await _getShowId(entry.urlEndpoint, config);
  let page = 0;

  while (true) {
    // autoExtend=false: fetch exactly one page per iteration so the search can stop early.
    const pageEntries = await _getAllEntries(showId, 'show', config, page, false);
    const foundEntry = pageEntries.find((candidate) => candidate.episode === entry.episode);
    if (foundEntry || pageEntries.length === 0) {
      return foundEntry;
    }
    page = page + 1;
  }
}
|
||||
|
||||
/**
 * Converts a release date label into a Date.
 *
 * @param {string} date - label text; anything containing "today" or "yesterday"
 *   (case-insensitive) is resolved relative to now, everything else is parsed
 *   with the 'MM/DD/YYYY' format.
 * @returns {Date} the resulting date.
 */
function _parseDate(date) {
  let parsed;
  if (date.match(/today/i) !== null) {
    parsed = moment();
  } else if (date.match(/yesterday/i) !== null) {
    parsed = moment().subtract(1, 'day');
  } else {
    parsed = moment(date, 'MM/DD/YYYY');
  }
  return parsed.toDate();
}
|
||||
|
||||
// Public API of this module. _getShowId is exported as well because the
// scraper's enrichShow calls horriblesubs._getShowId directly.
module.exports = { allShows, showData, getLatestEntries, _getShowId };
|
||||
|
||||
5216
scrapers/horriblesubs/horriblesubs_mapping.json
Normal file
5216
scrapers/horriblesubs/horriblesubs_mapping.json
Normal file
File diff suppressed because it is too large
Load Diff
122
scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
122
scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
@@ -0,0 +1,122 @@
|
||||
const fs = require('fs');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const horriblesubs = require('./horriblesubs_api.js');
|
||||
const repository = require('../../lib/repository');
|
||||
const { Type } = require('../../lib/types');
|
||||
const { updateCurrentSeeders } = require('../../lib/torrent');
|
||||
const { parseTorrentFiles } = require('../../lib/torrentFiles');
|
||||
const { getMetadata, getKitsuId } = require('../../lib/metadata');
|
||||
const showMappings = require('./horriblesubs_mapping.json');
|
||||
|
||||
// Provider name: used for the repository provider lookup, as the `provider`
// of created torrents, and as the prefix of log messages.
const NAME = 'HorribleSubs';
|
||||
|
||||
// Schedules show-level jobs (show scraping and show enrichment), configured with maxConcurrent 5.
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
// Schedules per-torrent processing inside _parseShowData, configured with maxConcurrent 20.
const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
|
||||
|
||||
/**
 * Entry point of the scraper: runs a full scrape of all shows when the stored
 * provider record has no `lastScraped` value; otherwise does nothing.
 *
 * @returns {Promise<void>}
 */
async function scrape() {
  const provider = await repository.getProvider({ name: NAME });
  // NOTE(review): assumes getProvider always resolves to an object — confirm against the repository module.
  if (provider.lastScraped) {
    return;
  }

  console.log(`${NAME}: no previous scrapping exist`);
  await _scrapeAllShows();
}
|
||||
|
||||
/**
 * Fetches the show list and scrapes the first 20 shows through the show limiter.
 * A failure while scraping one show is logged and does not reject the batch.
 *
 * @returns {Promise<Array>} resolves once every scheduled show job has settled.
 */
async function _scrapeAllShows() {
  console.log(`${NAME}: getting all shows...`);
  const shows = await horriblesubs.allShows();

  const scrapeShow = (show) => horriblesubs.showData(show)
      .then((showData) => _parseShowData(showData))
      .catch((err) => console.log(err));

  // Only the first 20 shows are processed.
  const jobs = shows
      .slice(0, 20)
      .map((show) => limiter.schedule(() => scrapeShow(show)));

  return Promise.all(jobs);
}
|
||||
|
||||
/**
 * Builds kitsu mappings for every show that has none yet, merges them into the
 * loaded `showMappings` object and writes the result to the mapping json file.
 *
 * The returned promise settles only after the write has finished. A write
 * failure is logged together with its error and does not reject.
 *
 * @returns {Promise<void>}
 */
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const allShows = await horriblesubs.allShows();
  const unmappedShows = allShows.filter((show) => !showMappings[show.title]);
  const enrichedShows = await Promise.all(
      unmappedShows.map((show) => limiter.schedule(() => enrichShow(show))));

  // Merge into the in-memory mapping so the written file keeps the existing entries too.
  enrichedShows.forEach((show) => {
    showMappings[show.title] = show;
  });

  // Wrap the callback API so the function does not resolve before the file is written.
  await new Promise((resolve) => {
    fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(showMappings), 'utf8', (err) => {
      if (err) {
        console.log("An error occurred while writing JSON Object to File.", err);
      } else {
        console.log(`${NAME}: finished kitsu mapping`);
      }
      resolve();
    });
  });
}
|
||||
|
||||
/**
 * Extends a show with its show id and kitsu metadata.
 *
 * @param {{title: string, url: string}} show - show as returned by the show list.
 * @returns {Promise<Object>} the show's own properties plus `showId`,
 *   `kitsu_id`, `kitsuTitle` and `imdb_id`. When the show id lookup rejects, the
 *   show title is used as `showId`; when the kitsu lookup rejects, the three
 *   metadata fields are undefined.
 */
async function enrichShow(show) {
  console.log(`${NAME}: getting show info for ${show.title}...`);

  const useTitleAsId = () => show.title;
  const showId = await horriblesubs._getShowId(show.url).catch(useTitleAsId);

  const onMetadataFailure = (error) => {
    console.log(`Failed getting kitsu meta: ${error.message}`);
    return {};
  };
  const { kitsu_id, name, imdb_id } = await getKitsuId(show.title)
      .then((kitsuId) => getMetadata(kitsuId))
      .catch(onMetadataFailure);

  // `showId` comes first so a `showId` property already on `show` would take precedence.
  return {
    showId,
    ...show,
    kitsu_id,
    kitsuTitle: name,
    imdb_id
  };
}
|
||||
|
||||
/**
 * Stores every mirror of every single and pack episode of a show as a torrent
 * together with its files.
 *
 * @param {{title: string, singleEpisodes: Array, packEpisodes: Array}} showData -
 *   show data as produced by the horriblesubs api module.
 * @returns {Promise<void>} resolves after all torrents of the show were processed.
 * @throws {Error} when the show has no mapping or the mapping has no kitsu id;
 *   also rejects when processing any torrent fails (e.g. no video files found).
 */
async function _parseShowData(showData) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const showMapping = showMappings[showData.title];
  if (!showMapping) {
    throw new Error(`No kitsu mapping found for ${showData.title}`);
  }
  const kitsuId = showMapping.kitsu_id;
  if (!kitsuId) {
    throw new Error(`No kitsuId found for ${showData.title}`);
  }

  // One torrent candidate per (episode, mirror) pair.
  const incompleteTorrents = [].concat(showData.singleEpisodes).concat(showData.packEpisodes)
      .map((episodeInfo) => episodeInfo.mirrors
          .map((mirror) => ({
            provider: NAME,
            ...mirror,
            title: `${episodeInfo.title} ${episodeInfo.episode} [${mirror.resolution}]`,
            size: 300000000,
            type: Type.ANIME,
            uploadDate: episodeInfo.uploadDate,
          })))
      .reduce((a, b) => a.concat(b), []);

  const storeTorrent = async (incompleteTorrent) => {
    const candidate = await checkIfExists(incompleteTorrent);
    const torrent = candidate && await updateCurrentSeeders(candidate);
    if (!torrent) {
      // Already stored for this provider (or nothing to update): skip.
      return;
    }
    const files = verifyFiles(await parseTorrentFiles(torrent, undefined, kitsuId), torrent);
    await repository.createTorrent(torrent);
    // Wait for every file so the log below is truthful and failures propagate
    // instead of becoming unhandled rejections.
    await Promise.all(files.map((file) => repository.createFile(file)));
    console.log(`Created entry for ${torrent.title}`);
  };

  await Promise.all(incompleteTorrents
      .map((incompleteTorrent) => entryLimiter.schedule(() => storeTorrent(incompleteTorrent))));
  console.log(`${NAME}: finished scrapping ${showData.title} data`);
}
|
||||
|
||||
/**
 * Ensures that a torrent yielded at least one file.
 *
 * @param {Array} files - files parsed from the torrent.
 * @param {{title: string}} [torrent] - torrent the files belong to; only used to
 *   make the error message identifiable.
 * @returns {Array} the same `files` array when it is non-empty.
 * @throws {Error} when `files` is missing or empty.
 */
function verifyFiles(files, torrent) {
  if (files && files.length) {
    return files;
  }
  // `torrent` used to be an undeclared name here, which raised a ReferenceError
  // instead of the intended error message.
  const label = torrent && torrent.title ? torrent.title : 'unknown torrent';
  throw new Error(`No video files found for: ${label}`);
}
|
||||
|
||||
/**
 * Decides whether a torrent still has to be stored for this provider.
 *
 * @param {Object} torrent - torrent candidate to look up in the repository.
 * @returns {Promise<Object|undefined>} the candidate itself when nothing is
 *   stored yet (a failing lookup counts as "not stored"); undefined when this
 *   provider already stored it; otherwise the candidate with `size` and
 *   `seeders` taken from the stored torrent.
 */
async function checkIfExists(torrent) {
  const stored = await repository.getTorrent(torrent).catch(() => undefined);

  if (!stored) {
    return torrent; // no torrent exists yet
  }
  if (stored.provider === NAME) {
    return undefined; // torrent by this provider already exists
  }

  const { size, seeders } = stored;
  return { ...torrent, size, seeders };
}
|
||||
|
||||
// Only scrape is exported; initMapping and the remaining helpers are not part of the module's exports.
module.exports = { scrape };
|
||||
Reference in New Issue
Block a user