From 051c50de3fc2605ec75708cc131fcfba7e93185e Mon Sep 17 00:00:00 2001
From: TheBeastLT
Date: Tue, 7 Jan 2020 13:33:46 +0100
Subject: [PATCH] updates the package structure and the initial horriblesubs
 scraper (WIP)

---
 index.js                                      |   6 +-
 lib/cache.js                                  |  19 +-
 lib/metadata.js                               |  68 ++++---
 lib/torrent.js                                |   6 +-
 lib/torrentFiles.js                           |  58 +++---
 {scrapers/manual => manual}/manual.js         |   8 +-
 scrapers/horiblesubs_scraper.js               | 172 ------------------
 .../horriblesubs_api.js}                      |  33 ++--
 .../horriblesubs/horriblesubs_mapping.json    |   0
 scrapers/horriblesubs/horriblesubs_scraper.js | 122 +++++++++++++
 .../thepiratebay_api.js}                      |   5 +-
 .../thepiratebay_scraper.js}                  |  47 +++--
 12 files changed, 264 insertions(+), 280 deletions(-)
 rename {scrapers/manual => manual}/manual.js (83%)
 delete mode 100644 scrapers/horiblesubs_scraper.js
 rename scrapers/{api/horriblesubs.js => horriblesubs/horriblesubs_api.js} (80%)
 rename horrible_subs_mapping.json => scrapers/horriblesubs/horriblesubs_mapping.json (100%)
 create mode 100644 scrapers/horriblesubs/horriblesubs_scraper.js
 rename scrapers/{api/thepiratebay.js => thepiratebay/thepiratebay_api.js} (97%)
 rename scrapers/{piratebay_dump.js => thepiratebay/thepiratebay_scraper.js} (82%)

diff --git a/index.js b/index.js
index c094954..e51b924 100644
--- a/index.js
+++ b/index.js
@@ -2,10 +2,10 @@ require('dotenv').config();
 const express = require("express");
 const server = express();
 const { connect } = require('./lib/repository');
-const tpbDump = require('./scrapers/piratebay_dump');
-const horribleSubsScraper = require('./scrapers/horiblesubs_scraper');
+const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper');
+const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper');
 
-const providers = [tpbDump];
+const providers = [horribleSubsScraper];
 
 async function scrape() {
   providers.forEach((provider) => provider.scrape());
diff --git a/lib/cache.js b/lib/cache.js
index 5509694..3542a36 100644
--- a/lib/cache.js
+++ b/lib/cache.js
@@ -11,6 +11,21 @@ const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days
 const MONGO_URI = process.env.MONGODB_URI;
 
 const cache = initiateCache();
+const torrentFilesCache = initiateTorrentFilesCache();
+
+function initiateTorrentFilesCache() {
+  if (MONGO_URI) {
+    return cacheManager.caching({
+      store: mangodbStore,
+      uri: MONGO_URI,
+      options: { collection: 'cacheManager' },
+      ttl: GLOBAL_TTL,
+      ignoreCacheErrors: true
+    });
+  }
+  // fall back to an in-memory store like initiateCache does, otherwise retrieveTorrentFiles crashes when no MongoDB is configured
+  return cacheManager.caching({ store: 'memory', ttl: GLOBAL_TTL });
+}
 
 function initiateCache() {
   if (MONGO_URI) {
@@ -18,7 +33,7 @@ function initiateCache() {
       store: mangodbStore,
       uri: MONGO_URI,
       options: {
-        collection: 'cacheManager',
+        collection: 'torrentio_scraper_collection',
       },
       ttl: GLOBAL_TTL,
       ignoreCacheErrors: true
@@ -32,7 +47,7 @@ function initiateCache() {
 }
 
 function retrieveTorrentFiles(infoHash) {
-  return cache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
+  return torrentFilesCache.get(`${TORRENT_FILES_KEY_PREFIX}:${infoHash}`)
     .then((results) => {
       if (!results) {
         throw new Error('No cached files found');
diff --git a/lib/metadata.js b/lib/metadata.js
index 123db2e..655712b 100644
--- a/lib/metadata.js
+++ b/lib/metadata.js
@@ -2,52 +2,46 @@ const needle = require('needle');
 const nameToImdb = require('name-to-imdb');
 const bing = require('nodejs-bing');
 const { cacheWrapImdbId, cacheWrapMetadata } = require('./cache');
+const { Type } = require('./types');
 
 const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
 const KITSU_URL = 'https://anime-kitsu.now.sh';
 
-function getMetadata(imdbId, type) {
-  
return cacheWrapMetadata(imdbId,
-      () => needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 60000 })
-          .then((response) => {
-            const body = response.body;
-            if (body && body.meta && body.meta.name) {
-              return {
-                imdbId: imdbId,
-                title: body.meta.name,
-                year: body.meta.year,
-                genres: body.meta.genres,
-                totalEpisodes: body.meta.videos && body.meta.videos
-                    .filter(video => video.season > 0).length,
-                episodeCount: body.meta.videos && Object.values(body.meta.videos
-                    .filter((entry) => entry.season !== 0)
-                    .sort((a, b) => a.season - b.season)
-                    .reduce((map, next) => {
-                      map[next.season] = map[next.season] + 1 || 1;
-                      return map;
-                    }, {}))
-              };
-            } else {
-              throw new Error('No search results');
-            }
-          })
-          .catch((error) => {
-            throw new Error(`failed cinemeta query ${imdbId} due: ${error.message}`);
-          }));
-}
-
-function getKitsuMetadata(kitsuId) {
-  const key = kitsuId.startsWith('kitsu:') ? kitsuId : `kitsu:${kitsuId}`;
+function getMetadata(id, type = Type.SERIES) {
+  const key = id.match(/^\d+$/) ? `kitsu:${id}` : id;
   return cacheWrapMetadata(key,
       () => needle('get', `${KITSU_URL}/meta/series/${key}.json`, { open_timeout: 60000 })
          .then((response) => {
            const body = response.body;
            if (body && body.meta && body.meta.id) {
              return {
-                ...body.meta,
-                videos: undefined,
-                totalEpisodes: body.meta.videos && body.meta.videos
-                    .filter(video => video.season > 0).length
+                kitsuId: body.kitsu_id,
+                imdbId: body.imdb_id,
+                title: body.meta.name,
+                year: body.meta.year,
+                genres: body.meta.genres,
+                videos: body.meta.videos && body.meta.videos
+                    .map((video) => video.imdbSeason
+                        ? {
+                          episode: video.episode,
+                          imdbSeason: video.imdbSeason,
+                          imdbEpisode: video.imdbEpisode
+                        }
+                        : {
+                          season: video.season,
+                          episode: video.episode,
+                          kitsuId: video.kitsu_id,
+                          kitsuEpisode: video.kitsuEpisode,
+                          released: video.released
+                        }
+                    ),
+                episodeCount: body.meta.videos && Object.values(body.meta.videos
+                    .filter((entry) => entry.season !== 0)
+                    .sort((a, b) => a.season - b.season)
+                    .reduce((map, next) => {
+                      map[next.season] = map[next.season] + 1 || 1;
+                      return map;
+                    }, {}))
              };
            } else {
              throw new Error('No search results');
            }
          })
@@ -101,4 +95,4 @@ async function getKitsuId(title) {
     }));
 }
 
-module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuMetadata, getKitsuId };
+module.exports = { escapeTitle, getMetadata, getImdbId, getKitsuId };
diff --git a/lib/torrent.js b/lib/torrent.js
index d6ae5cf..49865f2 100644
--- a/lib/torrent.js
+++ b/lib/torrent.js
@@ -6,6 +6,10 @@ const { retrieveTorrentFiles } = require('./cache');
 const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
 const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
 
+module.exports.updateCurrentSeeders = function(torrent) {
+  return Promise.resolve({ ...torrent, seeders: 0 }); // WIP stub: real seeder lookup is not implemented yet
+};
+
 module.exports.torrentFiles = function(torrent) {
   return filesFromTorrentFile(torrent)
       .catch(() => filesFromTorrentStream(torrent))
@@ -28,7 +32,7 @@ async function filesFromTorrentFile(torrent) {
     return Promise.reject(new Error("no torrentLink"));
   }
 
-  needle('get', torrent.torrentLink, { open_timeout: 2000 })
+  return needle('get', torrent.torrentLink, { open_timeout: 10000 })
     .then((response) => {
       if (!response.body || response.statusCode !== 200) {
         throw new Error('torrent not found')
diff --git a/lib/torrentFiles.js b/lib/torrentFiles.js
index 389772a..4ceaa34 100644
--- a/lib/torrentFiles.js
+++ b/lib/torrentFiles.js
@@ -5,8 +5,14 @@ const { 
Type } = require('./types');
 
 const MIN_SIZE = 20 * 1024 * 1024; // 20 MB
 
-async function parseTorrentFiles(torrent, imdbId) {
+async function parseTorrentFiles(torrent, imdbId, kitsuId) {
   const parsedTorrentName = parse(torrent.title);
+  parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
+  const metadata = await getMetadata(kitsuId || imdbId, torrent.type || Type.MOVIE).catch(() => undefined);
+
+  if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
+    throw new Error(`Mismatching entry type for ${torrent.title}: ${torrent.type}!=${metadata.type}`);
+  }
 
   if (torrent.type === Type.MOVIE) {
     if (parsedTorrentName.complete) {
@@ -27,31 +33,20 @@ async function parseTorrentFiles(torrent, imdbId) {
       });
   }
 
-  return [{
+  return [ {
     infoHash: torrent.infoHash,
     title: torrent.title,
     size: torrent.size,
-    imdbId: imdbId,
-  }];
+    imdbId: imdbId || metadata && metadata.imdbId,
+    kitsuId: kitsuId || metadata && metadata.kitsuId
+  } ];
   }
 
-  if (parsedTorrentName.season && parsedTorrentName.episode) {
-    return [{
-      infoHash: torrent.infoHash,
-      title: torrent.title,
-      size: torrent.size,
-      imdbId: imdbId,
-      imdbSeason: parsedTorrentName.season,
-      imdbEpisode: parsedTorrentName.episode
-    }];
-  }
-
-  parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/);
-  return torrentFiles(torrent)
+  return getSeriesFiles(torrent, parsedTorrentName)
       .then((files) => files
           .filter((file) => file.size > MIN_SIZE)
           .map((file) => parseSeriesFile(file, parsedTorrentName)))
-      .then((files) => decomposeAbsoluteEpisodes(files, torrent, imdbId))
+      .then((files) => decomposeAbsoluteEpisodes(files, metadata))
      .then((files) => Promise.all(files.map(file => file.isMovie ?
          
mapSeriesMovie(file, torrent.infoHash) : mapSeriesEpisode(file, torrent.infoHash, imdbId)))) @@ -62,6 +57,18 @@ async function parseTorrentFiles(torrent, imdbId) { }); } +async function getSeriesFiles(torrent, parsedTorrentName) { + if (parsedTorrentName.episode || parsedTorrentName.date) { + return [ { + name: torrent.title, + path: torrent.title, + size: torrent.size + } ]; + } + + return torrentFiles(torrent); +} + async function mapSeriesEpisode(file, infoHash, imdbId) { if (!file.episodes) { return Promise.resolve([]); @@ -69,21 +76,22 @@ async function mapSeriesEpisode(file, infoHash, imdbId) { return Promise.resolve(file.episodes.map(episode => ({ infoHash: infoHash, fileIndex: file.fileIndex, - title: file.name, + title: file.path || file.name, size: file.size, imdbId: imdbId, imdbSeason: file.season, - imdbEpisode: episode}))) + imdbEpisode: episode + }))) } async function mapSeriesMovie(file, infoHash) { - return findMovieImdbId(file).then((imdbId) => [{ + return findMovieImdbId(file).then((imdbId) => [ { infoHash: infoHash, fileIndex: file.fileIndex, title: file.name, size: file.size, imdbId: imdbId - }]) + } ]) } function parseSeriesFile(file, parsedTorrentName) { @@ -96,7 +104,8 @@ function parseSeriesFile(file, parsedTorrentName) { const pathInfo = parse(folders[folders.length - 2]); fileInfo.season = pathInfo.season; } - fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season && !fileInfo.episodes || !!fileInfo.year; + fileInfo.isMovie = parsedTorrentName.hasMovies && !fileInfo.season && + (!fileInfo.episodes || !!fileInfo.year || !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i)); return { ...file, ...fileInfo }; } @@ -111,12 +120,11 @@ function findMovieImdbId(title) { return getImdbId(searchQuery).catch((error) => undefined); } -async function decomposeAbsoluteEpisodes(files, torrent, imdbId) { +async function decomposeAbsoluteEpisodes(files, metadata) { if (files.every((file) => !file.episodes || file.episodes.every((ep) => ep < 100))) { return files; // nothing to decompose } - const metadata = await getMetadata(imdbId, torrent.type || Type.MOVIE); // decompose if season is inside path, but individual files are concatenated ex. 
101 (S01E01) files .filter(file => file.season && metadata.episodeCount[file.season] < 100) diff --git a/scrapers/manual/manual.js b/manual/manual.js similarity index 83% rename from scrapers/manual/manual.js rename to manual/manual.js index 6e502ed..71f3ecb 100644 --- a/scrapers/manual/manual.js +++ b/manual/manual.js @@ -1,11 +1,11 @@ require('dotenv').config(); -const repository = require('../../lib/repository'); -const { parseTorrentFiles } = require('../../lib/torrentFiles'); -const { Type } = require('../../lib/types'); +const repository = require('../lib/repository'); +const { parseTorrentFiles } = require('../lib/torrentFiles'); +const { Type } = require('../lib/types'); async function addMissingEpisodes() { const torrent = { infoHash: '0ec780c2c7f8d5b38e61827f0b53c77c3d22f955' }; - const torrentFiles = await require('../../lib/torrent').torrentFiles(torrent); + const torrentFiles = await require('../lib/torrent').torrentFiles(torrent); const storedFiles = await repository.getFiles(torrent) .then((files) => files.reduce((map, next) => (map[next.fileIndex] = next, map), {})); const imdbId = Object.values(storedFiles)[0].imdbId; diff --git a/scrapers/horiblesubs_scraper.js b/scrapers/horiblesubs_scraper.js deleted file mode 100644 index 7e5eb43..0000000 --- a/scrapers/horiblesubs_scraper.js +++ /dev/null @@ -1,172 +0,0 @@ -const moment = require('moment'); -const fs = require('fs'); -const needle = require('needle'); -const Bottleneck = require('bottleneck'); -const { parse } = require('parse-torrent-title'); -const decode = require('magnet-uri'); -const horriblesubs = require('./api/horriblesubs'); -const { Type } = require('../lib/types'); -const { torrentFiles, currentSeeders } = require('../lib/torrent'); -const repository = require('../lib/repository'); -const { getImdbId, getMetadata, getKitsuId, getKitsuMetadata } = require('../lib/metadata'); - -const NAME = 'HorribleSubs'; - -const limiter = new Bottleneck({maxConcurrent: 5}); -const entryLimiter = new Bottleneck({maxConcurrent: 20}); - -async function scrape() { - const lastScraped = await repository.getProvider({ name: NAME }); - - if (!lastScraped.lastScraped) { - console.log(`${NAME}: no previous scrapping exist`); - await _scrapeAllShows() - } -} - -async function _scrapeAllShows() { - initMapping(); - // console.log(`${NAME}: getting all shows...`); - // const shows = await horriblesubs.allShows(); - - // Promise.all(shows - // .slice(0, 20) - // //.filter(show => show.url.includes('piece')) - // .map((show) => limiter.schedule(() => horriblesubs.showData(show) - // .then((showData) => _parseShowData(showData)) - // .catch((err) => console.log(err))))); -} - -async function initMapping() { - console.log(`${NAME}: initiating kitsu mapping...`); - const currentMapping = require('../horrible_subs_mapping'); - const mappings = Object.values(currentMapping); - const shows = await horriblesubs.allShows() - .then((shows) => shows.filter((show) => !mappings.find((mapping) => mapping.title === show.title))) - .then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show))))) - .then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), currentMapping)); - - fs.writeFile("./horrible_subs_mapping.json", JSON.stringify(shows), 'utf8', function (err) { - if (err) { - console.log("An error occurred while writing JSON Object to File."); - } - }); - console.log(`${NAME}: finished kitsu mapping`); -} - -async function enrichShow(show) { - console.log(`${NAME}: getting show info for 
${show.title}...`); - const showId = await horriblesubs._getShowId(show.url) - .catch((error) => show.title); - const metadata = await getKitsuId(show.title) - .then((kitsuId) => getKitsuMetadata(kitsuId)) - .catch((error) => { - console.log(`Failed getting kitsu meta: ${error.message}`); - return {}; - }); - - return { - showId: showId, - ...show, - kitsu_id: metadata.kitsu_id, - kitsuTitle: metadata.name, - kitsuSlug: metadata.slug, - imdb_id: metadata.imdb_id - } -} - -async function _parseShowData(showData) { - console.log(`${NAME}: scrapping ${showData.title} data...`); - const imdbId = hardcodedShows[showData.showId] || await getImdbId({ - name: showData.title.replace(/\W+/g, ' ').toLowerCase(), - type: 'series' - }).catch(() => undefined); - const metadata = imdbId && await getMetadata(imdbId, 'series') || {}; - - return Promise.all([ - showData.singleEpisodes - .map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror)))) - .reduce((a, b) => a.concat(b), []), - showData.packEpisodes - .map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() =>_constructPackEntry(metadata, pack, mirror)))) - .reduce((a, b) => a.concat(b), []) - ].reduce((a, b) => a.concat(b), [])) - .then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent))); -} - -async function _constructSingleEntry(metadata, single, mirror) { - mirror.infoHash = decode(mirror.magnetLink).infoHash; - const seeders = await currentSeeders(mirror); - const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/); - const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season - const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode - const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode); - const title = `${single.title} ${single.episode} [${mirror.resolution}]`; - const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode}; - - return { - infoHash: mirror.infoHash, - provider: NAME, - title: title, - type: Type.ANIME, - imdbId: metadata.imdbId, - uploadDate: single.uploadDate, - seeders: seeders, - files: [file] - } -} - -async function _constructPackEntry(metadata, pack, mirror) { - mirror.infoHash = decode(mirror.magnetLink).infoHash; - const seeders = await currentSeeders(mirror); - const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/); - const xSeason = seasonMatch && parseInt(seasonMatch[1]); - - const files = await torrentFiles(mirror) - .then((files) => files.map((file) => { - const title = file.path.match(/[^\/]+$/)[0]; - const titleInfo = parse(title.replace(pack.title, '')); - return titleInfo.episodes - .map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode)) - .map((actual) => ({ - title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode - })); - })) - .then((files) => files.reduce((a, b) => a.concat(b), [])) - .catch(() => []); - - return { - infoHash: mirror.infoHash, - provider: NAME, - title: `${pack.title} ${pack.episode} [${mirror.resolution}]`, - type: 'anime', - imdbId: metadata.imdbId, - uploadDate: pack.uploadDate, - seeders: seeders, - files: files - } -} - -function actualSeasonEpisode(metadata, xSeason, xEpisode) { - if (xSeason) { - return { - season: xSeason, - episode: xEpisode, - absoluteEpisode: metadata.episodeCount && metadata.episodeCount - .slice(0, xSeason - 1) - .reduce((a, b) => a + b, xEpisode), - 
} - } else if (metadata.episodeCount) { - return metadata.episodeCount - .reduce((epInfo, epCount) => { - if (epInfo.episode > epCount) { - epInfo.season = epInfo.season + 1; - epInfo.episode = epInfo.episode - epCount; - } - return epInfo; - }, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode }) - } - return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode } -} - -module.exports = { scrape }; \ No newline at end of file diff --git a/scrapers/api/horriblesubs.js b/scrapers/horriblesubs/horriblesubs_api.js similarity index 80% rename from scrapers/api/horriblesubs.js rename to scrapers/horriblesubs/horriblesubs_api.js index bc76cdb..0ba0202 100644 --- a/scrapers/api/horriblesubs.js +++ b/scrapers/horriblesubs/horriblesubs_api.js @@ -1,13 +1,14 @@ const cheerio = require('cheerio'); const needle = require('needle'); const moment = require('moment'); +const decode = require('magnet-uri'); const defaultUrl = 'https://horriblesubs.info'; const defaultTimeout = 5000; function allShows(config = {}) { return _getContent('/shows', config) - .then(($) => $('div[class="ind-show"]') + .then(($) => $('div[class="ind-show"]') .map((index, element) => $(element).children('a')) .map((index, element) => ({ title: element.attr('title'), @@ -71,11 +72,12 @@ function _getEntries(endpoint, config) { episode: $(element).find('a[class="rls-label"]').find('strong').text(), uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()), mirrors: $(element).find('div[class="rls-links-container"]').children() - .map((indexLink, elementLink) => ({ - resolution: $(elementLink).attr('id').match(/\d+p$/)[0], - magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'), - torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href') - })).get() + .map((indexLink, elementLink) => ({ + resolution: $(elementLink).attr('id').match(/\d+p$/)[0], + infoHash: decode($(elementLink).find('a[title="Magnet Link"]').attr('href')).infoHash, + magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'), + torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href') + })).get() })).get()) } @@ -84,13 +86,14 @@ function _getAllLatestEntries(config, page = 0) { const entriesEndpoint = `/api.php?method=getlatest${pageParam}`; return _getContent(entriesEndpoint, config) .then(($) => $('li a') - .map((index, element) => ({ - urlEndpoint: $(element).attr('href'), - episode: $(element).find('strong').text() - })).get()) - .then((entries) => entries.length < 12 ? entries : - _getAllLatestEntries(config, page + 1) - .then((nextEntries) => entries.concat(nextEntries))) + .map((index, element) => ({ + urlEndpoint: $(element).attr('href'), + episode: $(element).find('strong').text() + })).get()) + .then((entries) => entries.length < 12 + ? 
entries
+          : _getAllLatestEntries(config, page + 1)
+              .then((nextEntries) => entries.concat(nextEntries)))
 }
 
 async function _findLatestEntry(entry, config) {
@@ -99,8 +102,8 @@ async function _findLatestEntry(entry, config) {
   let page = 0;
   let reachedEnd = false;
 
-  while(!foundEntry && !reachedEnd) {
-    const allEntries = await _getAllEntries(showId, 'show', config, page,false);
+  while (!foundEntry && !reachedEnd) {
+    const allEntries = await _getAllEntries(showId, 'show', config, page, false);
     foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
     page = page + 1;
     reachedEnd = allEntries.length === 0;
diff --git a/horrible_subs_mapping.json b/scrapers/horriblesubs/horriblesubs_mapping.json
similarity index 100%
rename from horrible_subs_mapping.json
rename to scrapers/horriblesubs/horriblesubs_mapping.json
diff --git a/scrapers/horriblesubs/horriblesubs_scraper.js b/scrapers/horriblesubs/horriblesubs_scraper.js
new file mode 100644
index 0000000..a809427
--- /dev/null
+++ b/scrapers/horriblesubs/horriblesubs_scraper.js
@@ -0,0 +1,122 @@
+const fs = require('fs');
+const Bottleneck = require('bottleneck');
+const { parse } = require('parse-torrent-title');
+const horriblesubs = require('./horriblesubs_api.js');
+const repository = require('../../lib/repository');
+const { Type } = require('../../lib/types');
+const { updateCurrentSeeders } = require('../../lib/torrent');
+const { parseTorrentFiles } = require('../../lib/torrentFiles');
+const { getMetadata, getKitsuId } = require('../../lib/metadata');
+const showMappings = require('./horriblesubs_mapping.json');
+
+const NAME = 'HorribleSubs';
+
+const limiter = new Bottleneck({ maxConcurrent: 5 });
+const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
+
+async function scrape() {
+  const lastScraped = await repository.getProvider({ name: NAME });
+
+  if (!lastScraped.lastScraped) {
+    console.log(`${NAME}: no previous scrape exists`);
+    await _scrapeAllShows();
+  }
+}
+
+async function _scrapeAllShows() {
+  console.log(`${NAME}: getting all shows...`);
+  const shows = await horriblesubs.allShows();
+
+  return Promise.all(shows
+      .slice(0, 20) // WIP: only process the first 20 shows for now
+      .map((show) => limiter.schedule(() => horriblesubs.showData(show)
+          .then((showData) => _parseShowData(showData))
+          .catch((err) => console.log(err)))));
+}
+
+async function initMapping() {
+  console.log(`${NAME}: initiating kitsu mapping...`);
+  const shows = await horriblesubs.allShows()
+      .then((shows) => shows.filter((show) => !showMappings[show.title]))
+      .then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
+      .then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
+
+  fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(shows), 'utf8', function (err) {
+    if (err) {
+      console.log("An error occurred while writing JSON Object to File.");
+    } else {
+      console.log(`${NAME}: finished kitsu mapping`);
+    }
+  });
+}
+
+async function enrichShow(show) {
+  console.log(`${NAME}: getting show info for ${show.title}...`);
+  const showId = await horriblesubs._getShowId(show.url)
+      .catch((error) => show.title);
+  const metadata = await getKitsuId(show.title)
+      .then((kitsuId) => getMetadata(kitsuId))
+      .catch((error) => {
+        console.log(`Failed getting kitsu meta: ${error.message}`);
+        return {};
+      });
+
+  return {
+    showId: showId,
+    ...show,
+    kitsu_id: metadata.kitsuId,
+    kitsuTitle: metadata.title,
+    imdb_id: metadata.imdbId
+  }
+}
+
+async function _parseShowData(showData) {
+  
console.log(`${NAME}: scraping ${showData.title} data...`);
+  const showMapping = showMappings[showData.title];
+  const kitsuId = showMapping && showMapping.kitsu_id;
+  if (!showMapping) {
+    throw new Error(`No kitsu mapping found for ${showData.title}`);
+  }
+  if (!kitsuId) {
+    throw new Error(`No kitsuId found for ${showData.title}`);
+  }
+
+  return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
+      .map((episodeInfo) => episodeInfo.mirrors
+          .map((mirror) => ({
+            provider: NAME,
+            ...mirror,
+            title: `${episodeInfo.title} ${episodeInfo.episode} [${mirror.resolution}]`,
+            size: 300000000, // placeholder size (~300 MB); replaced with the stored size when the torrent already exists
+            type: Type.ANIME,
+            uploadDate: episodeInfo.uploadDate,
+          })))
+      .reduce((a, b) => a.concat(b), [])
+      .map((incompleteTorrent) => entryLimiter.schedule(() => checkIfExists(incompleteTorrent)
+          .then((torrent) => torrent && updateCurrentSeeders(torrent))
+          .then((torrent) => torrent && parseTorrentFiles(torrent, undefined, kitsuId)
+              .then((files) => verifyFiles(torrent, files))
+              .then((files) => repository.createTorrent(torrent)
+                  .then(() => files.forEach(file => repository.createFile(file)))
+                  .then(() => console.log(`Created entry for ${torrent.title}`)))))))
+      .then(() => console.log(`${NAME}: finished scraping ${showData.title} data`));
+}
+
+function verifyFiles(torrent, files) {
+  if (files && files.length) {
+    return files;
+  }
+  throw new Error(`No video files found for: ${torrent.title}`);
+}
+
+async function checkIfExists(torrent) {
+  const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
+  if (!existingTorrent) {
+    return torrent; // no torrent exists yet
+  } else if (existingTorrent.provider === NAME) {
+    return undefined; // torrent by this provider already exists
+  }
+  return { ...torrent, size: existingTorrent.size, seeders: existingTorrent.seeders };
+}
+
+module.exports = { scrape };
\ No newline at end of file
diff --git a/scrapers/api/thepiratebay.js b/scrapers/thepiratebay/thepiratebay_api.js
similarity index 97%
rename from scrapers/api/thepiratebay.js
rename to scrapers/thepiratebay/thepiratebay_api.js
index adf0250..998532d 100644
--- a/scrapers/api/thepiratebay.js
+++ b/scrapers/thepiratebay/thepiratebay_api.js
@@ -156,7 +156,8 @@ function parseBody(body) {
 
   $('table[id=\'searchResult\'] tr').each(function() {
     const name = $(this).find('.detLink').text();
-    if (!name || name === 'Do NOT download any torrent before hiding your IP with a VPN.') {
+    const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
+    if (!name || !sizeMatcher) {
       return;
     }
     torrents.push({
@@ -166,7 +167,7 @@ function parseBody(body) {
       magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
       category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10),
       subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10),
-      size: parseSize($(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/)[1])
+      size: parseSize(sizeMatcher[1])
     });
   });
   resolve(torrents);
diff --git a/scrapers/piratebay_dump.js b/scrapers/thepiratebay/thepiratebay_scraper.js
similarity index 82%
rename from scrapers/piratebay_dump.js
rename to scrapers/thepiratebay/thepiratebay_scraper.js
index b242aab..367cd5a 100644
--- a/scrapers/piratebay_dump.js
+++ b/scrapers/thepiratebay/thepiratebay_scraper.js
@@ -5,12 +5,12 @@ const { ungzip } = require('node-gzip');
 const LineByLineReader = require('line-by-line');
 const fs = require('fs');
 
const { parse } = require('parse-torrent-title'); -const pirata = require('./api/thepiratebay'); +const thepiratebay = require('./thepiratebay_api.js'); const bing = require('nodejs-bing'); -const { Type } = require('../lib/types'); -const repository = require('../lib/repository'); -const { getImdbId, escapeTitle } = require('../lib/metadata'); -const { parseTorrentFiles } = require('../lib/torrentFiles'); +const { Type } = require('../../lib/types'); +const repository = require('../../lib/repository'); +const { getImdbId, escapeTitle } = require('../../lib/metadata'); +const { parseTorrentFiles } = require('../../lib/torrentFiles'); const NAME = 'ThePirateBay'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; @@ -21,7 +21,7 @@ async function scrape() { const lastScraped = await repository.getProvider({ name: NAME }); const lastDump = { updatedAt: 2147000000 }; const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); - //const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); + //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) { console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); @@ -83,16 +83,16 @@ async function scrape() { } } const allowedCategories = [ - pirata.Categories.VIDEO.MOVIES, - pirata.Categories.VIDEO.MOVIES_HD, - pirata.Categories.VIDEO.MOVIES_DVDR, - pirata.Categories.VIDEO.MOVIES_3D, - pirata.Categories.VIDEO.TV_SHOWS, - pirata.Categories.VIDEO.TV_SHOWS_HD + thepiratebay.Categories.VIDEO.MOVIES, + thepiratebay.Categories.VIDEO.MOVIES_HD, + thepiratebay.Categories.VIDEO.MOVIES_DVDR, + thepiratebay.Categories.VIDEO.MOVIES_3D, + thepiratebay.Categories.VIDEO.TV_SHOWS, + thepiratebay.Categories.VIDEO.TV_SHOWS_HD ]; const seriesCategories = [ - pirata.Categories.VIDEO.TV_SHOWS, - pirata.Categories.VIDEO.TV_SHOWS_HD + thepiratebay.Categories.VIDEO.TV_SHOWS, + thepiratebay.Categories.VIDEO.TV_SHOWS_HD ]; async function processTorrentRecord(record) { const alreadyExists = await repository.getSkipTorrent(record) @@ -132,7 +132,7 @@ async function processTorrentRecord(record) { seeders: torrentFound.seeders, }; - if (!imdbId) { + if (!imdbId && !titleInfo.complete) { console.log(`imdbId not found: ${torrentFound.name}`); repository.createFailedImdbTorrent(torrent); return; @@ -145,8 +145,8 @@ async function processTorrentRecord(record) { } repository.createTorrent(torrent) - .then(() => files.forEach(file => repository.createFile(file))); - console.log(`Created entry for ${torrentFound.name}`); + .then(() => files.forEach(file => repository.createFile(file))) + .then(() => console.log(`Created entry for ${torrentFound.name}`)); } async function findTorrent(record) { @@ -158,7 +158,7 @@ async function findTorrentInSource(record) { let page = 0; let torrentFound; while (!torrentFound && page < 5) { - const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page }); + const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page }); torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0]; page = torrents.length === 0 ? 
1000 : page + 1;
   }
   return torrentFound;
 }
 
@@ -178,10 +178,19 @@ async function findTorrentViaBing(record) {
       }
       return result.link.match(/torrent\/(\w+)\//)[1];
     })
-    .then((torrentId) => pirata.torrent(torrentId))
+    .then((torrentId) => thepiratebay.torrent(torrentId))
 }
 
 function downloadDump(dump) {
+  try {
+    if (fs.existsSync(CSV_FILE_PATH)) {
+      console.log('dump file already exists...');
+      return;
+    }
+  } catch (err) {
+    console.error(err);
+  }
+
   console.log('downloading dump file...');
   return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
     .then((response) => response.body)