Updates the package structure and the initial HorribleSubs scraper (WIP)
This commit is contained in:
6
index.js
6
index.js
@@ -2,10 +2,10 @@ require('dotenv').config();
|
||||
const express = require("express");
|
||||
const server = express();
|
||||
const { connect } = require('./lib/repository');
|
||||
const tpbDump = require('./scrapers/piratebay_dump');
|
||||
const horribleSubsScraper = require('./scrapers/horiblesubs_scraper');
|
||||
const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
|
||||
const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
|
||||
|
||||
const providers = [tpbDump];
|
||||
const providers = [horribleSubsScraper];
|
||||
|
||||
async function scrape() {
|
||||
providers.forEach((provider) => provider.scrape());
|
||||
|
||||
19
lib/cache.js
19
lib/cache.js
@@ -11,6 +11,21 @@ const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days
|
||||
const MONGO_URI = process.env.MONGODB_URI;
|
||||
|
||||
const cache = initiateCache();
|
||||
const torrentFilesCache = initiateTorrentFilesCache();
|
||||
|
||||
/**
 * Creates the cache that `retrieveTorrentFiles` reads torrent file listings from.
 *
 * When MONGO_URI is configured the cache is backed by the `cacheManager`
 * collection. Otherwise an in-memory cache is returned, so the module-level
 * `torrentFilesCache` is never `undefined` and `torrentFilesCache.get(...)`
 * cannot fail with a TypeError.
 *
 * @returns {object} cache instance created by `cacheManager.caching`
 */
function initiateTorrentFilesCache() {
  if (MONGO_URI) {
    return cacheManager.caching({
      store: mangodbStore,
      uri: MONGO_URI,
      options: {
        collection: 'cacheManager',
      },
      ttl: GLOBAL_TTL,
      ignoreCacheErrors: true
    });
  }
  // No Mongo connection configured: an empty memory cache makes lookups miss
  // cleanly instead of crashing on an undefined cache object.
  return cacheManager.caching({
    store: 'memory',
    ttl: GLOBAL_TTL
  });
}
|
||||
|
||||
function initiateCache() {
|
||||
if (MONGO_URI) {
|
||||
@@ -18,7 +33,7 @@ function initiateCache() {
|
||||
store: mangodbStore,
|
||||
uri: MONGO_URI,
|
||||
options: {
|
||||
collection: 'cacheManager',
|
||||
collection: 'torrentio_scraper_collection',
|
||||
},
|
||||
ttl: GLOBAL_TTL,
|
||||
ignoreCacheErrors: true
|
||||
@@ -32,7 +47,7 @@ function initiateCache() {
|
||||
}
|
||||
|
||||
function retrieveTorrentFiles(infoHash) {
|
||||
return cache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
|
||||
return torrentFilesCache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
|
||||
.then((results) => {
|
||||
if (!results) {
|
||||
throw new Error('No cached files found');
|
||||
|
||||
@@ -2,52 +2,46 @@ const needle = require('needle');
|
||||
const nameToImdb = require('name-to-imdb');
|
||||
const bing = require('nodejs-bing');
|
||||
const { cacheWrapImdbId, cacheWrapMetadata } = require('./cache');
|
||||
const { Type } = require('./types');
|
||||
|
||||
const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
|
||||
const KITSU_URL = 'https://anime-kitsu.now.sh';
|
||||
|
||||
function getMetadata(imdbId, type) {
|
||||
return cacheWrapMetadata(imdbId,
|
||||
() => needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 60000 })
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (body && body.meta && body.meta.name) {
|
||||
return {
|
||||
imdbId: imdbId,
|
||||
title: body.meta.name,
|
||||
year: body.meta.year,
|
||||
genres: body.meta.genres,
|
||||
totalEpisodes: body.meta.videos && body.meta.videos
|
||||
.filter(video => video.season > 0).length,
|
||||
episodeCount: body.meta.videos && Object.values(body.meta.videos
|
||||
.filter((entry) => entry.season !== 0)
|
||||
.sort((a, b) => a.season - b.season)
|
||||
.reduce((map, next) => {
|
||||
map[next.season] = map[next.season] + 1 || 1;
|
||||
return map;
|
||||
}, {}))
|
||||
};
|
||||
} else {
|
||||
throw new Error('No search results');
|
||||
}
|
||||
})
|
||||
.catch((error) => {
|
||||
throw new Error(`failed cinemeta query ${imdbId} due: ${error.message}`);
|
||||
}));
|
||||
}
|
||||
|
||||
function getKitsuMetadata(kitsuId) {
|
||||
const key = kitsuId.startsWith('kitsu:') ? kitsuId : `kitsu:${kitsuId}`;
|
||||
function getMetadata(id, type = Type.SERIES ) {
|
||||
const key = id.match(/^\d+$/) ? `kitsu:${id}` : id;
|
||||
return cacheWrapMetadata(key,
|
||||
() => needle('get', `${KITSU_URL}/meta/series/${key}.json`, { open_timeout: 60000 })
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (body && body.meta && body.meta.id) {
|
||||
return {
|
||||
...body.meta,
|
||||
videos: undefined,
|
||||
totalEpisodes: body.meta.videos && body.meta.videos
|
||||
.filter(video => video.season > 0).length
|
||||
kitsuId: body.kitsu_id,
|
||||
imdbId: body.imdb_id,
|
||||
title: body.meta.name,
|
||||
year: body.meta.year,
|
||||
genres: body.meta.genres,
|
||||
videos: body.meta.videos && body.meta.videos
|
||||
.map((video) => video.imdbSeason
|
||||
? {
|
||||
episode: video.episode,
|
||||
imdbSeason: video.imdbSeason,
|
||||
imdbEpisode: video.imdbEpisode
|
||||
}
|
||||
: {
|
||||
season: video.season,
|
||||
episode: video.episode,
|
||||
kitsuId: video.kitsu_id,
|
||||
kitsuEpisode: video.kitsuEpisode,
|
||||
released: video.released
|
||||
}
|
||||
),
|
||||
episodeCount: body.meta.videos && Object.values(body.meta.videos
|
||||
.filter((entry) => entry.season !== 0)
|
||||
.sort((a, b) => a.season - b.season)
|
||||
.reduce((map, next) => {
|
||||
map[next.season] = map[next.season] + 1 || 1;
|
||||
return map;
|
||||
}, {}))
|
||||
};
|
||||
} else {
|
||||
throw new Error('No search results');
|
||||
@@ -101,4 +95,4 @@ async function getKitsuId(title) {
|
||||
}));
|
||||
}
|
||||
|
||||
module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuMetadata, getKitsuId };
|
||||
module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuId };
|
||||
|
||||
@@ -6,6 +6,10 @@ const { retrieveTorrentFiles } = require('./cache');
|
||||
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
|
||||
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
|
||||
|
||||
/**
 * Resolves with a shallow copy of the given torrent whose `seeders` field is
 * set to 0. The input object itself is left untouched.
 *
 * @param {object} torrent torrent entry to copy
 * @returns {Promise<object>} copy of the torrent with `seeders: 0`
 */
module.exports.updateCurrentSeeders = function(torrent) {
  const withSeeders = Object.assign({}, torrent, { seeders: 0 });
  return Promise.resolve(withSeeders);
};
|
||||
|
||||
module.exports.torrentFiles = function(torrent) {
|
||||
return filesFromTorrentFile(torrent)
|
||||
.catch(() => filesFromTorrentStream(torrent))
|
||||
@@ -28,7 +32,7 @@ async function filesFromTorrentFile(torrent) {
|
||||
return Promise.reject(new Error("no torrentLink"));
|
||||
}
|
||||
|
||||
needle('get', torrent.torrentLink, { open_timeout: 2000 })
|
||||
return needle('get', torrent.torrentLink, { open_timeout: 10000 })
|
||||
.then((response) => {
|
||||
if (!response.body || response.statusCode !== 200) {
|
||||
throw new Error('torrent not found')
|
||||
|
||||
@@ -5,8 +5,14 @@ const { Type } = require('./types');
|
||||
|
||||
const MIN_SIZE = 20 * 1024 * 1024; // 20 MB
|
||||
|
||||
async function parseTorrentFiles(torrent, imdbId) {
|
||||
async function parseTorrentFiles(torrent, imdbId, kitsuId) {
|
||||
const parsedTorrentName = parse(torrent.title);
|
||||
parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
|
||||
const metadata = await getMetadata(kitsuId || imdbId, torrent.type || Type.MOVIE).catch(() => undefined);
|
||||
|
||||
if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
|
||||
throw new Error(`Mismatching entry type for ${torrent.name}: ${torrent.type}!=${metadata.type}`);
|
||||
}
|
||||
|
||||
if (torrent.type === Type.MOVIE) {
|
||||
if (parsedTorrentName.complete) {
|
||||
@@ -27,31 +33,20 @@ async function parseTorrentFiles(torrent, imdbId) {
|
||||
});
|
||||
}
|
||||
|
||||
return [{
|
||||
return [ {
|
||||
infoHash: torrent.infoHash,
|
||||
title: torrent.title,
|
||||
size: torrent.size,
|
||||
imdbId: imdbId,
|
||||
}];
|
||||
imdbId: imdbId || metadata && metadata.imdb_id,
|
||||
kitsuId: kitsuId || metadata && metadata.kitsu_id
|
||||
} ];
|
||||
}
|
||||
|
||||
if (parsedTorrentName.season && parsedTorrentName.episode) {
|
||||
return [{
|
||||
infoHash: torrent.infoHash,
|
||||
title: torrent.title,
|
||||
size: torrent.size,
|
||||
imdbId: imdbId,
|
||||
imdbSeason: parsedTorrentName.season,
|
||||
imdbEpisode: parsedTorrentName.episode
|
||||
}];
|
||||
}
|
||||
|
||||
parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
|
||||
return torrentFiles(torrent)
|
||||
return getSeriesFiles(torrent, parsedTorrentName)
|
||||
.then((files) => files
|
||||
.filter((file) => file.size > MIN_SIZE)
|
||||
.map((file) => parseSeriesFile(file, parsedTorrentName)))
|
||||
.then((files) => decomposeAbsoluteEpisodes(files, torrent, imdbId))
|
||||
.then((files) => decomposeAbsoluteEpisodes(files, metadata))
|
||||
.then((files) => Promise.all(files.map(file => file.isMovie
|
||||
? mapSeriesMovie(file, torrent.infoHash)
|
||||
: mapSeriesEpisode(file, torrent.infoHash, imdbId))))
|
||||
@@ -62,6 +57,18 @@ async function parseTorrentFiles(torrent, imdbId) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Resolves the list of files that should be parsed for a series torrent.
 *
 * When the parsed torrent name already carries an episode or a date, the
 * torrent is represented by one synthetic file entry built from its title and
 * size. Otherwise the file list is fetched through `torrentFiles`.
 *
 * @param {object} torrent torrent entry (`title`, `size` are read here)
 * @param {object} parsedTorrentName parsed torrent title info
 * @returns {Promise<object[]>} file entries
 */
async function getSeriesFiles(torrent, parsedTorrentName) {
  const describesSingleRelease = Boolean(parsedTorrentName.episode || parsedTorrentName.date);
  if (!describesSingleRelease) {
    return torrentFiles(torrent);
  }

  const { title, size } = torrent;
  return [{ name: title, path: title, size: size }];
}
|
||||
|
||||
async function mapSeriesEpisode(file, infoHash, imdbId) {
|
||||
if (!file.episodes) {
|
||||
return Promise.resolve([]);
|
||||
@@ -69,21 +76,22 @@ async function mapSeriesEpisode(file, infoHash, imdbId) {
|
||||
return Promise.resolve(file.episodes.map(episode => ({
|
||||
infoHash: infoHash,
|
||||
fileIndex: file.fileIndex,
|
||||
title: file.name,
|
||||
title: file.path || file.name,
|
||||
size: file.size,
|
||||
imdbId: imdbId,
|
||||
imdbSeason: file.season,
|
||||
imdbEpisode: episode})))
|
||||
imdbEpisode: episode
|
||||
})))
|
||||
}
|
||||
|
||||
async function mapSeriesMovie(file, infoHash) {
|
||||
return findMovieImdbId(file).then((imdbId) => [{
|
||||
return findMovieImdbId(file).then((imdbId) => [ {
|
||||
infoHash: infoHash,
|
||||
fileIndex: file.fileIndex,
|
||||
title: file.name,
|
||||
size: file.size,
|
||||
imdbId: imdbId
|
||||
}])
|
||||
} ])
|
||||
}
|
||||
|
||||
function parseSeriesFile(file, parsedTorrentName) {
|
||||
@@ -96,7 +104,8 @@ function parseSeriesFile(file, parsedTorrentName) {
|
||||
const pathInfo = parse(folders[folders.length - 2]);
|
||||
fileInfo.season = pathInfo.season;
|
||||
}
|
||||
fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season && !fileInfo.episodes || !!fileInfo.year;
|
||||
fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season &&
|
||||
(!fileInfo.episodes || !!fileInfo.year || !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i));
|
||||
|
||||
return { ...file, ...fileInfo };
|
||||
}
|
||||
@@ -111,12 +120,11 @@ function findMovieImdbId(title) {
|
||||
return getImdbId(searchQuery).catch((error) => undefined);
|
||||
}
|
||||
|
||||
async function decomposeAbsoluteEpisodes(files, torrent, imdbId) {
|
||||
async function decomposeAbsoluteEpisodes(files, metadata) {
|
||||
if (files.every((file) => !file.episodes || file.episodes.every((ep) => ep < 100))) {
|
||||
return files; // nothing to decompose
|
||||
}
|
||||
|
||||
const metadata = await getMetadata(imdbId, torrent.type || Type.MOVIE);
|
||||
// decompose if season is inside path, but individual files are concatenated ex. 101 (S01E01)
|
||||
files
|
||||
.filter(file => file.season && metadata.episodeCount[file.season] < 100)
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
require('dotenv').config();
|
||||
const repository = require('../../lib/repository');
|
||||
const { parseTorrentFiles } = require('../../lib/torrentFiles');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../lib/repository');
|
||||
const { parseTorrentFiles } = require('../lib/torrentFiles');
|
||||
const { Type } = require('../lib/types');
|
||||
|
||||
async function addMissingEpisodes() {
|
||||
const torrent = { infoHash: '0ec780c2c7f8d5b38e61827f0b53c77c3d22f955' };
|
||||
const torrentFiles = await require('../../lib/torrent').torrentFiles(torrent);
|
||||
const torrentFiles = await require('../lib/torrent').torrentFiles(torrent);
|
||||
const storedFiles = await repository.getFiles(torrent)
|
||||
.then((files) => files.reduce((map, next) => (map[next.fileIndex] = next, map), {}));
|
||||
const imdbId = Object.values(storedFiles)[0].imdbId;
|
||||
@@ -1,172 +0,0 @@
|
||||
const moment = require('moment');
|
||||
const fs = require('fs');
|
||||
const needle = require('needle');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const decode = require('magnet-uri');
|
||||
const horriblesubs = require('./api/horriblesubs');
|
||||
const { Type } = require('../lib/types');
|
||||
const { torrentFiles, currentSeeders } = require('../lib/torrent');
|
||||
const repository = require('../lib/repository');
|
||||
const { getImdbId, getMetadata, getKitsuId, getKitsuMetadata } = require('../lib/metadata');
|
||||
|
||||
const NAME = 'HorribleSubs';
|
||||
|
||||
const limiter = new Bottleneck({maxConcurrent: 5});
|
||||
const entryLimiter = new Bottleneck({maxConcurrent: 20});
|
||||
|
||||
/**
 * Provider entry point. Loads the stored provider record for NAME and runs a
 * full scrape of all shows only when that record has no `lastScraped` value.
 *
 * @returns {Promise<void>}
 */
async function scrape() {
  const provider = await repository.getProvider({ name: NAME });
  if (provider.lastScraped) {
    return;
  }

  console.log(`${NAME}: no previous scrapping exist`);
  await _scrapeAllShows();
}
|
||||
|
||||
/**
 * Runs the full scrape. At the moment this only rebuilds the kitsu mapping;
 * the per-show scraping further down is commented out.
 *
 * @returns {Promise<void>} settles once the mapping run has settled
 */
async function _scrapeAllShows() {
  // Await the mapping instead of leaving the promise floating, so the caller
  // observes both completion and failures of the mapping run.
  await initMapping();
  // TODO: re-enable per-show scraping.
  // console.log(`${NAME}: getting all shows...`);
  // const shows = await horriblesubs.allShows();

  // Promise.all(shows
  //     .slice(0, 20)
  //     //.filter(show => show.url.includes('piece'))
  //     .map((show) => limiter.schedule(() => horriblesubs.showData(show)
  //         .then((showData) => _parseShowData(showData))
  //         .catch((err) => console.log(err)))));
}
|
||||
|
||||
/**
 * Extends the stored show mapping with every show that is not mapped yet and
 * writes the merged mapping back to disk.
 *
 * Shows already present in the mapping (matched by title) are skipped. The
 * remaining ones are enriched through `enrichShow`, throttled by `limiter`,
 * and added to the loaded mapping object keyed by show title.
 *
 * The returned promise settles only after the file write has finished. A
 * failed write is logged and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const currentMapping = require('../horrible_subs_mapping');
  // Collect mapped titles once instead of scanning all mappings per show.
  const mappedTitles = new Set(Object.values(currentMapping).map((mapping) => mapping.title));

  const unmappedShows = await horriblesubs.allShows()
      .then((shows) => shows.filter((show) => !mappedTitles.has(show.title)));
  const enrichedShows = await Promise.all(
      unmappedShows.map((show) => limiter.schedule(() => enrichShow(show))));
  // New entries are merged into the loaded mapping object itself.
  const shows = enrichedShows.reduce((map, show) => {
    map[show.title] = show;
    return map;
  }, currentMapping);

  // NOTE(review): this path is resolved against the process working directory,
  // while the require above is resolved against this module's directory.
  // Confirm both point at the same file.
  await new Promise((resolve) => {
    fs.writeFile("./horrible_subs_mapping.json", JSON.stringify(shows), 'utf8', (err) => {
      if (err) {
        console.log("An error occurred while writing JSON Object to File.");
      } else {
        // Report success only once the write has actually completed.
        console.log(`${NAME}: finished kitsu mapping`);
      }
      resolve();
    });
  });
}
|
||||
|
||||
/**
 * Builds a mapping entry for a show by combining its show id with the kitsu
 * metadata found for its title.
 *
 * If the show id lookup fails, the show title is used as the id. If the kitsu
 * lookup fails, the failure is logged and the kitsu/imdb fields stay undefined.
 *
 * @param {object} show show entry (`title` and `url` are read here)
 * @returns {Promise<object>} the show extended with showId, kitsu_id,
 *     kitsuTitle, kitsuSlug and imdb_id
 */
async function enrichShow(show) {
  console.log(`${NAME}: getting show info for ${show.title}...`);

  const useTitleAsId = () => show.title;
  const resolvedShowId = await horriblesubs._getShowId(show.url).catch(useTitleAsId);

  const kitsuMeta = await getKitsuId(show.title)
      .then((id) => getKitsuMetadata(id))
      .catch((err) => {
        console.log(`Failed getting kitsu meta: ${err.message}`);
        return {};
      });

  // showId comes first so a showId already present on the show overrides it.
  return Object.assign({ showId: resolvedShowId }, show, {
    kitsu_id: kitsuMeta.kitsu_id,
    kitsuTitle: kitsuMeta.name,
    kitsuSlug: kitsuMeta.slug,
    imdb_id: kitsuMeta.imdb_id
  });
}
|
||||
|
||||
// Manual showId -> imdbId overrides, consulted before the imdb lookup.
// _parseShowData referenced this map although it was never declared in this
// file, which raised a ReferenceError on every call.
const hardcodedShows = {};

/**
 * Builds torrent entries for every mirror of every single and pack release of
 * a show and stores them through `repository.updateTorrent`.
 *
 * The imdb id is taken from `hardcodedShows` when present, otherwise looked up
 * by the normalised show title. A failed lookup leaves it undefined, and the
 * entries are then built with empty metadata.
 *
 * @param {object} showData show data with `title`, `showId`, `singleEpisodes`
 *     and `packEpisodes` (each release carrying a `mirrors` array)
 * @returns {Promise<void>} resolves after all entries were handed to the repository
 */
async function _parseShowData(showData) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const imdbId = hardcodedShows[showData.showId] || await getImdbId({
    name: showData.title.replace(/\W+/g, ' ').toLowerCase(),
    type: 'series'
  }).catch(() => undefined);
  const metadata = imdbId && await getMetadata(imdbId, 'series') || {};

  // One throttled job per mirror of each release.
  const scheduleEntries = (releases, constructEntry) => releases
      .map((release) => release.mirrors
          .map((mirror) => entryLimiter.schedule(() => constructEntry(metadata, release, mirror))))
      .reduce((a, b) => a.concat(b), []);

  const pendingEntries = scheduleEntries(showData.singleEpisodes, _constructSingleEntry)
      .concat(scheduleEntries(showData.packEpisodes, _constructPackEntry));
  const torrentEntries = await Promise.all(pendingEntries);

  // Wait for the updates so failures surface here instead of being dropped.
  await Promise.all(torrentEntries.map((torrent) => repository.updateTorrent(torrent)));
}
|
||||
|
||||
/**
 * Builds the torrent entry for one mirror of a single-episode release.
 *
 * The info hash is decoded from the mirror's magnet link and stored on the
 * mirror object. A trailing one or two digit number in the release title is
 * treated as a possible season, and `single.episode` as a seasonal or absolute
 * episode number; `actualSeasonEpisode` resolves both against the metadata.
 *
 * @param {object} metadata show metadata (`imdbId` is read here)
 * @param {object} single release info (`title`, `episode`, `uploadDate`)
 * @param {object} mirror mirror info (`magnetLink`, `resolution`); gets `infoHash` assigned
 * @returns {Promise<object>} torrent entry with a single file
 */
async function _constructSingleEntry(metadata, single, mirror) {
  mirror.infoHash = decode(mirror.magnetLink).infoHash;
  const seeders = await currentSeeders(mirror);

  const trailingNumber = single.title.match(/[Ss]?(\d{1,2})\W*$/);
  // could have a season
  const xSeason = trailingNumber ? parseInt(trailingNumber[1]) : trailingNumber;
  // could be a seasonal or absolute episode
  const xEpisode = parseInt(single.episode);
  const resolved = actualSeasonEpisode(metadata, xSeason, xEpisode);

  const entryTitle = `${single.title} ${single.episode} [${mirror.resolution}]`;
  const fileEntry = {
    title: entryTitle,
    season: resolved.season,
    episode: resolved.episode,
    absoluteEpisode: resolved.absoluteEpisode
  };

  return {
    infoHash: mirror.infoHash,
    provider: NAME,
    title: entryTitle,
    type: Type.ANIME,
    imdbId: metadata.imdbId,
    uploadDate: single.uploadDate,
    seeders: seeders,
    files: [fileEntry]
  };
}
|
||||
|
||||
/**
 * Builds the torrent entry for one mirror of a pack (batch) release.
 *
 * The info hash is decoded from the mirror's magnet link and stored on the
 * mirror object. Every file of the torrent is parsed for episode numbers, and
 * each episode becomes one file entry resolved through `actualSeasonEpisode`.
 *
 * Files whose name yields no episodes are skipped. When the file list cannot
 * be retrieved at all, the failure is logged and the entry is returned with an
 * empty `files` array.
 *
 * @param {object} metadata show metadata (`imdbId` is read here)
 * @param {object} pack release info (`title`, `episode`, `uploadDate`)
 * @param {object} mirror mirror info (`magnetLink`, `resolution`); gets `infoHash` assigned
 * @returns {Promise<object>} torrent entry with one file entry per parsed episode
 */
async function _constructPackEntry(metadata, pack, mirror) {
  mirror.infoHash = decode(mirror.magnetLink).infoHash;
  const seeders = await currentSeeders(mirror);
  const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/);
  const xSeason = seasonMatch && parseInt(seasonMatch[1], 10);

  const files = await torrentFiles(mirror)
      .then((torrentContents) => torrentContents
          .map((file) => {
            const title = file.path.match(/[^\/]+$/)[0];
            const titleInfo = parse(title.replace(pack.title, ''));
            // A file without parsed episodes contributes nothing. Without this
            // guard it would throw and drop the file list of the whole pack.
            return (titleInfo.episodes || [])
                .map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode))
                .map((actual) => ({
                  title: title,
                  season: actual.season,
                  episode: actual.episode,
                  absoluteEpisode: actual.absoluteEpisode
                }));
          })
          .reduce((a, b) => a.concat(b), []))
      .catch((error) => {
        // Still best-effort, but no longer silent.
        console.log(`${NAME}: failed retrieving files for ${pack.title}: ${error.message}`);
        return [];
      });

  return {
    infoHash: mirror.infoHash,
    provider: NAME,
    title: `${pack.title} ${pack.episode} [${mirror.resolution}]`,
    type: 'anime',
    imdbId: metadata.imdbId,
    uploadDate: pack.uploadDate,
    seeders: seeders,
    files: files
  };
}
|
||||
|
||||
/**
 * Resolves a scraped season/episode pair into season, seasonal episode and
 * absolute episode.
 *
 * `metadata.episodeCount`, when present, is used as an array of per-season
 * episode counts where index 0 holds the count of season 1.
 *
 * - With a season given, the episode is taken as seasonal and the absolute
 *   episode is the episode plus the counts of all earlier seasons (undefined
 *   when no counts are available).
 * - Without a season but with counts, the episode is taken as absolute and
 *   walked through the seasons until it fits into one.
 * - Otherwise the episode is reported as-is in season 1.
 *
 * @param {object} metadata show metadata, optionally holding `episodeCount`
 * @param {?number} xSeason scraped season, falsy when unknown
 * @param {number} xEpisode scraped episode number
 * @returns {{season: number, episode: number, absoluteEpisode: (number|undefined)}}
 */
function actualSeasonEpisode(metadata, xSeason, xEpisode) {
  const episodeCount = metadata.episodeCount;

  if (xSeason) {
    return {
      season: xSeason,
      episode: xEpisode,
      absoluteEpisode: episodeCount && episodeCount
          .slice(0, xSeason - 1)
          .reduce((a, b) => a + b, xEpisode),
    };
  }

  if (episodeCount) {
    let season = 1;
    let episode = xEpisode;
    for (const count of episodeCount) {
      // Stop at the first season the episode fits into. Continuing the walk
      // would let a later, shorter season shift the result once more.
      if (!(episode > count)) {
        break;
      }
      season += 1;
      episode -= count;
    }
    return { season: season, episode: episode, absoluteEpisode: xEpisode };
  }

  return { season: 1, episode: xEpisode, absoluteEpisode: xEpisode };
}
|
||||
|
||||
module.exports = { scrape };
|
||||
@@ -1,13 +1,14 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const decode = require('magnet-uri');
|
||||
|
||||
const defaultUrl = 'https://horriblesubs.info';
|
||||
const defaultTimeout = 5000;
|
||||
|
||||
function allShows(config = {}) {
|
||||
return _getContent('/shows', config)
|
||||
.then(($) => $('div[class="ind-show"]')
|
||||
.then(($) => $('div[class="ind-show"]')
|
||||
.map((index, element) => $(element).children('a'))
|
||||
.map((index, element) => ({
|
||||
title: element.attr('title'),
|
||||
@@ -71,11 +72,12 @@ function _getEntries(endpoint, config) {
|
||||
episode: $(element).find('a[class="rls-label"]').find('strong').text(),
|
||||
uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
|
||||
mirrors: $(element).find('div[class="rls-links-container"]').children()
|
||||
.map((indexLink, elementLink) => ({
|
||||
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
|
||||
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
|
||||
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
|
||||
})).get()
|
||||
.map((indexLink, elementLink) => ({
|
||||
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
|
||||
infoHash: decode($(elementLink).find('a[title="Magnet Link"]').attr('href')).infoHash,
|
||||
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
|
||||
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
|
||||
})).get()
|
||||
})).get())
|
||||
}
|
||||
|
||||
@@ -84,13 +86,14 @@ function _getAllLatestEntries(config, page = 0) {
|
||||
const entriesEndpoint = `/api.php?method=getlatest${pageParam}`;
|
||||
return _getContent(entriesEndpoint, config)
|
||||
.then(($) => $('li a')
|
||||
.map((index, element) => ({
|
||||
urlEndpoint: $(element).attr('href'),
|
||||
episode: $(element).find('strong').text()
|
||||
})).get())
|
||||
.then((entries) => entries.length < 12 ? entries :
|
||||
_getAllLatestEntries(config, page + 1)
|
||||
.then((nextEntries) => entries.concat(nextEntries)))
|
||||
.map((index, element) => ({
|
||||
urlEndpoint: $(element).attr('href'),
|
||||
episode: $(element).find('strong').text()
|
||||
})).get())
|
||||
.then((entries) => entries.length < 12
|
||||
? entries
|
||||
: _getAllLatestEntries(config, page + 1)
|
||||
.then((nextEntries) => entries.concat(nextEntries)))
|
||||
}
|
||||
|
||||
async function _findLatestEntry(entry, config) {
|
||||
@@ -99,8 +102,8 @@ async function _findLatestEntry(entry, config) {
|
||||
let page = 0;
|
||||
let reachedEnd = false;
|
||||
|
||||
while(!foundEntry && !reachedEnd) {
|
||||
const allEntries = await _getAllEntries(showId, 'show', config, page,false);
|
||||
while (!foundEntry && !reachedEnd) {
|
||||
const allEntries = await _getAllEntries(showId, 'show', config, page, false);
|
||||
foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
|
||||
page = page + 1;
|
||||
reachedEnd = allEntries.length === 0;
|
||||
122
scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
122
scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
@@ -0,0 +1,122 @@
|
||||
const fs = require('fs');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const horriblesubs = require('./horriblesubs_api.js');
|
||||
const repository = require('../../lib/repository');
|
||||
const { Type } = require('../../lib/types');
|
||||
const { updateCurrentSeeders } = require('../../lib/torrent');
|
||||
const { parseTorrentFiles } = require('../../lib/torrentFiles');
|
||||
const { getMetadata, getKitsuId } = require('../../lib/metadata');
|
||||
const showMappings = require('./horriblesubs_mapping.json');
|
||||
|
||||
const NAME = 'HorribleSubs';
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
|
||||
|
||||
/**
 * Provider entry point. Fetches the stored provider record for NAME and, when
 * it carries no `lastScraped` value, scrapes all shows.
 *
 * @returns {Promise<void>}
 */
async function scrape() {
  const providerRecord = await repository.getProvider({ name: NAME });
  const scrapedBefore = Boolean(providerRecord.lastScraped);
  if (scrapedBefore) {
    return;
  }

  console.log(`${NAME}: no previous scrapping exist`);
  await _scrapeAllShows();
}
|
||||
|
||||
/**
 * Fetches the list of shows and scrapes the first 20 of them, throttled by
 * `limiter`. A failure while scraping one show is logged and does not reject
 * the overall result.
 *
 * @returns {Promise<Array>} settles once every scheduled show was handled
 */
async function _scrapeAllShows() {
  console.log(`${NAME}: getting all shows...`);
  const allShows = await horriblesubs.allShows();

  const scrapeShow = (show) => horriblesubs.showData(show)
      .then((showData) => _parseShowData(showData))
      .catch((err) => console.log(err));

  const jobs = allShows
      .slice(0, 20)
      .map((show) => limiter.schedule(() => scrapeShow(show)));
  return Promise.all(jobs);
}
|
||||
|
||||
/**
 * Extends the show mapping with every show that has no mapping yet and writes
 * the merged mapping back to the mapping JSON file.
 *
 * Shows already present in `showMappings` (keyed by title) are skipped. The
 * remaining ones are enriched through `enrichShow`, throttled by `limiter`,
 * and added to `showMappings` itself.
 *
 * The returned promise settles only after the file write has finished. A
 * failed write is logged together with its reason and is not rethrown.
 *
 * @returns {Promise<void>}
 */
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const unmappedShows = await horriblesubs.allShows()
      .then((shows) => shows.filter((show) => !showMappings[show.title]));
  const enrichedShows = await Promise.all(
      unmappedShows.map((show) => limiter.schedule(() => enrichShow(show))));
  // New entries are merged into the shared showMappings object.
  const shows = enrichedShows.reduce((map, show) => {
    map[show.title] = show;
    return map;
  }, showMappings);

  // The path is relative to the process working directory. Wrapping the
  // callback keeps this function pending until the write has completed.
  await new Promise((resolve) => {
    fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(shows), 'utf8', (err) => {
      if (err) {
        console.log(`An error occurred while writing JSON Object to File. ${err.message}`);
      } else {
        console.log(`${NAME}: finished kitsu mapping`);
      }
      resolve();
    });
  });
}
|
||||
|
||||
/**
 * Builds a mapping entry for a show by combining its show id with the kitsu
 * metadata found for its title.
 *
 * If the show id lookup fails, the show title is used as the id. If the kitsu
 * lookup fails, the failure is logged and the kitsu/imdb fields stay undefined.
 *
 * The metadata fields are read under their camelCase names (`kitsuId`,
 * `title`, `imdbId`) first, which is how `getMetadata` in lib/metadata appears
 * to return them. The snake_case names are kept as a fallback for metadata
 * objects of the earlier shape.
 *
 * @param {object} show show entry (`title` and `url` are read here)
 * @returns {Promise<object>} the show extended with showId, kitsu_id,
 *     kitsuTitle and imdb_id
 */
async function enrichShow(show) {
  console.log(`${NAME}: getting show info for ${show.title}...`);
  const showId = await horriblesubs._getShowId(show.url)
      .catch((error) => show.title);
  const metadata = await getKitsuId(show.title)
      .then((kitsuId) => getMetadata(kitsuId))
      .catch((error) => {
        console.log(`Failed getting kitsu meta: ${error.message}`);
        return {};
      });

  return {
    showId: showId,
    ...show,
    kitsu_id: metadata.kitsuId || metadata.kitsu_id,
    kitsuTitle: metadata.title || metadata.name,
    imdb_id: metadata.imdbId || metadata.imdb_id
  };
}
|
||||
|
||||
/**
 * Creates torrent and file entries for every mirror of every single and pack
 * release of a show.
 *
 * Each mirror becomes an incomplete torrent (with a placeholder size) that is
 * processed under `entryLimiter`: existence check, seeders update, file
 * parsing with the show's kitsu id, then persisting the torrent and its files.
 * Torrents that `checkIfExists` reports as already stored for this provider
 * are skipped. A failure for one torrent is logged and does not abort the rest
 * of the show.
 *
 * @param {object} showData show data with `title`, `singleEpisodes` and
 *     `packEpisodes` (each release carrying a `mirrors` array)
 * @returns {Promise<void>}
 * @throws {Error} when the show has no mapping or the mapping has no kitsu id
 */
async function _parseShowData(showData) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const showMapping = showMappings[showData.title];
  if (!showMapping) {
    throw new Error(`No kitsu mapping found for ${showData.title}`);
  }
  const kitsuId = showMapping.kitsu_id;
  if (!kitsuId) {
    throw new Error(`No kitsuId found for ${showData.title}`);
  }

  const incompleteTorrents = [].concat(showData.singleEpisodes, showData.packEpisodes)
      .map((episodeInfo) => episodeInfo.mirrors
          .map((mirror) => ({
            provider: NAME,
            ...mirror,
            title: `${episodeInfo.title} ${episodeInfo.episode} [${mirror.resolution}]`,
            size: 300000000, // placeholder; replaced by the stored size when the torrent already exists
            type: Type.ANIME,
            uploadDate: episodeInfo.uploadDate,
          })))
      .reduce((a, b) => a.concat(b), []);

  const createEntry = async (incompleteTorrent) => {
    const missingTorrent = await checkIfExists(incompleteTorrent);
    const torrent = missingTorrent && await updateCurrentSeeders(missingTorrent);
    if (!torrent) {
      return; // already stored for this provider
    }
    const files = verifyFiles(await parseTorrentFiles(torrent, undefined, kitsuId), torrent);
    await repository.createTorrent(torrent);
    // Wait for every file so failures are not lost and the log line is truthful.
    await Promise.all(files.map((file) => repository.createFile(file)));
    console.log(`Created entry for ${torrent.title}`);
  };

  await Promise.all(incompleteTorrents
      .map((incompleteTorrent) => entryLimiter.schedule(() => createEntry(incompleteTorrent)
          .catch((error) => console.log(
              `${NAME}: failed creating entry for ${incompleteTorrent.title}: ${error.message}`)))));
  console.log(`${NAME}: finished scrapping ${showData.title} data`);
}
|
||||
|
||||
/**
 * Guards against torrents for which no files were resolved.
 *
 * @param {Array} files parsed file entries
 * @param {object} [torrent] torrent the files belong to; only its `title` is
 *     used, to make the error message more specific
 * @returns {Array} the given files, when there is at least one
 * @throws {Error} when `files` is missing or empty
 */
function verifyFiles(files, torrent) {
  if (files && files.length) {
    return files;
  }
  // The torrent is optional so existing single-argument calls keep working.
  // Previously an undeclared `torrent` was read here, which raised a
  // ReferenceError instead of the intended error.
  const suffix = torrent && torrent.title ? ` for: ${torrent.title}` : '';
  throw new Error(`No video files found${suffix}`);
}
|
||||
|
||||
/**
 * Decides whether a scraped torrent still has to be stored.
 *
 * - No stored torrent found (or the lookup failed): the torrent is returned as-is.
 * - Stored torrent belongs to this provider: `undefined` is returned.
 * - Stored torrent belongs to another provider: a copy of the torrent is
 *   returned carrying the stored `size` and `seeders`.
 *
 * @param {object} torrent scraped torrent entry
 * @returns {Promise<object|undefined>}
 */
async function checkIfExists(torrent) {
  const stored = await repository.getTorrent(torrent).catch(() => undefined);

  if (!stored) {
    return torrent; // no torrent exists yet
  }
  if (stored.provider === NAME) {
    return undefined; // torrent by this provider already exists
  }

  const { size, seeders } = stored;
  return Object.assign({}, torrent, { size: size, seeders: seeders });
}
|
||||
|
||||
module.exports = { scrape };
|
||||
@@ -156,7 +156,8 @@ function parseBody(body) {
|
||||
|
||||
$('table[id=\'searchResult\'] tr').each(function() {
|
||||
const name = $(this).find('.detLink').text();
|
||||
if (!name || name === 'Do NOT download any torrent before hiding your IP with a VPN.') {
|
||||
const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
|
||||
if (!name || !sizeMatcher) {
|
||||
return;
|
||||
}
|
||||
torrents.push({
|
||||
@@ -166,7 +167,7 @@ function parseBody(body) {
|
||||
magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
|
||||
category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10),
|
||||
subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10),
|
||||
size: parseSize($(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/)[1])
|
||||
size: parseSize(sizeMatcher[1])
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
@@ -5,12 +5,12 @@ const { ungzip } = require('node-gzip');
|
||||
const LineByLineReader = require('line-by-line');
|
||||
const fs = require('fs');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const pirata = require('./api/thepiratebay');
|
||||
const thepiratebay = require('./thepiratebay_api.js');
|
||||
const bing = require('nodejs-bing');
|
||||
const { Type } = require('../lib/types');
|
||||
const repository = require('../lib/repository');
|
||||
const { getImdbId, escapeTitle } = require('../lib/metadata');
|
||||
const { parseTorrentFiles } = require('../lib/torrentFiles');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../../lib/repository');
|
||||
const { getImdbId, escapeTitle } = require('../../lib/metadata');
|
||||
const { parseTorrentFiles } = require('../../lib/torrentFiles');
|
||||
|
||||
const NAME = 'ThePirateBay';
|
||||
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
|
||||
@@ -21,7 +21,7 @@ async function scrape() {
|
||||
const lastScraped = await repository.getProvider({ name: NAME });
|
||||
const lastDump = { updatedAt: 2147000000 };
|
||||
const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
|
||||
//const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
||||
//const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
||||
|
||||
if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) {
|
||||
console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
|
||||
@@ -83,16 +83,16 @@ async function scrape() {
|
||||
}
|
||||
}
|
||||
const allowedCategories = [
|
||||
pirata.Categories.VIDEO.MOVIES,
|
||||
pirata.Categories.VIDEO.MOVIES_HD,
|
||||
pirata.Categories.VIDEO.MOVIES_DVDR,
|
||||
pirata.Categories.VIDEO.MOVIES_3D,
|
||||
pirata.Categories.VIDEO.TV_SHOWS,
|
||||
pirata.Categories.VIDEO.TV_SHOWS_HD
|
||||
thepiratebay.Categories.VIDEO.MOVIES,
|
||||
thepiratebay.Categories.VIDEO.MOVIES_HD,
|
||||
thepiratebay.Categories.VIDEO.MOVIES_DVDR,
|
||||
thepiratebay.Categories.VIDEO.MOVIES_3D,
|
||||
thepiratebay.Categories.VIDEO.TV_SHOWS,
|
||||
thepiratebay.Categories.VIDEO.TV_SHOWS_HD
|
||||
];
|
||||
const seriesCategories = [
|
||||
pirata.Categories.VIDEO.TV_SHOWS,
|
||||
pirata.Categories.VIDEO.TV_SHOWS_HD
|
||||
thepiratebay.Categories.VIDEO.TV_SHOWS,
|
||||
thepiratebay.Categories.VIDEO.TV_SHOWS_HD
|
||||
];
|
||||
async function processTorrentRecord(record) {
|
||||
const alreadyExists = await repository.getSkipTorrent(record)
|
||||
@@ -132,7 +132,7 @@ async function processTorrentRecord(record) {
|
||||
seeders: torrentFound.seeders,
|
||||
};
|
||||
|
||||
if (!imdbId) {
|
||||
if (!imdbId && !titleInfo.complete) {
|
||||
console.log(`imdbId not found: ${torrentFound.name}`);
|
||||
repository.createFailedImdbTorrent(torrent);
|
||||
return;
|
||||
@@ -145,8 +145,8 @@ async function processTorrentRecord(record) {
|
||||
}
|
||||
|
||||
repository.createTorrent(torrent)
|
||||
.then(() => files.forEach(file => repository.createFile(file)));
|
||||
console.log(`Created entry for ${torrentFound.name}`);
|
||||
.then(() => files.forEach(file => repository.createFile(file)))
|
||||
.then(() => console.log(`Created entry for ${torrentFound.name}`));
|
||||
}
|
||||
|
||||
async function findTorrent(record) {
|
||||
@@ -158,7 +158,7 @@ async function findTorrentInSource(record) {
|
||||
let page = 0;
|
||||
let torrentFound;
|
||||
while (!torrentFound && page < 5) {
|
||||
const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
|
||||
const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
|
||||
torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0];
|
||||
page = torrents.length === 0 ? 1000 : page + 1;
|
||||
}
|
||||
@@ -178,10 +178,19 @@ async function findTorrentViaBing(record) {
|
||||
}
|
||||
return result.link.match(/torrent\/(\w+)\//)[1];
|
||||
})
|
||||
.then((torrentId) => pirata.torrent(torrentId))
|
||||
.then((torrentId) => thepiratebay.torrent(torrentId))
|
||||
}
|
||||
|
||||
function downloadDump(dump) {
|
||||
try {
|
||||
if (fs.existsSync(CSV_FILE_PATH)) {
|
||||
console.log('dump file already exist...');
|
||||
return;
|
||||
}
|
||||
} catch(err) {
|
||||
console.error(err)
|
||||
}
|
||||
|
||||
console.log('downloading dump file...');
|
||||
return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
|
||||
.then((response) => response.body)
|
||||
Reference in New Issue
Block a user