From f795f5315c76423f2946ad0c4646b911737d4083 Mon Sep 17 00:00:00 2001
From: TheBeastLT
Date: Sat, 17 Oct 2020 21:15:05 +0200
Subject: [PATCH] [scraper] adds nyaa pantsu scraper

---
 package-lock.json                                  | 61 +++++++++++-
 package.json                                       |  3 +-
 scraper/lib/metadata.js                            | 25 +++--
 scraper/lib/parseHelper.js                         | 65 +++++++++++++
 .../{request_helper.js => requestHelper.js}        |  0
 scraper/lib/torrentFiles.js                        | 48 ++--------
 scraper/manual/manual.js                           | 10 +-
 scraper/scheduler/scrapers.js                      |  4 +-
 scraper/scrapers/1337x/1337x_api.js                |  2 +-
 scraper/scrapers/eztv/eztv_api.js                  |  2 +-
 .../scrapers/nyaapantsu/nyaa_pantsu_api.js         | 63 ++++++++++++
 .../nyaapantsu/nyaa_pantsu_scraper.js              | 96 +++++++++++++++++++
 scraper/scrapers/yts/yts_api.js                    |  2 +-
 13 files changed, 316 insertions(+), 65 deletions(-)
 create mode 100644 scraper/lib/parseHelper.js
 rename scraper/lib/{request_helper.js => requestHelper.js} (100%)
 create mode 100644 scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js
 create mode 100644 scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js

diff --git a/package-lock.json b/package-lock.json
index 9f20d99..a4e8d8b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1259,6 +1259,11 @@
       "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
       "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
     },
+    "lodash.omit": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/lodash.omit/-/lodash.omit-4.5.0.tgz",
+      "integrity": "sha1-brGa5aHuHdnfC5aeZs4Lf6MLXmA="
+    },
     "lodash.pick": {
       "version": "4.4.0",
       "resolved": "https://registry.npmjs.org/lodash.pick/-/lodash.pick-4.4.0.tgz",
@@ -1584,6 +1589,58 @@
         "boolbase": "~1.0.0"
       }
     },
+    "nyaapi": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/nyaapi/-/nyaapi-2.3.3.tgz",
+      "integrity": "sha512-lOHNNCut7TXmRYoVtCQYLUrm4Wlakyq+iFdAQYAFuUB9Z/i+hQdMJQjDlYn46oJtERUd7taauS3wr2fljwzrVA==",
+      "requires": {
+        "cheerio": "^1.0.0-rc.3",
+        "lodash.omit": "^4.5.0",
+        "request": "^2.88.0",
+        "request-promise": "^4.2.5"
+      },
+      "dependencies": {
+        "cheerio": {
+          "version": "1.0.0-rc.3",
+          "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.3.tgz",
+          "integrity": "sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==",
+          "requires": {
+            "css-select": "~1.2.0",
+            "dom-serializer": "~0.1.1",
+            "entities": "~1.1.1",
+            "htmlparser2": "^3.9.1",
+            "lodash": "^4.15.0",
+            "parse5": "^3.0.1"
+          }
+        },
+        "request-promise": {
+          "version": "4.2.6",
+          "resolved": "https://registry.npmjs.org/request-promise/-/request-promise-4.2.6.tgz",
+          "integrity": "sha512-HCHI3DJJUakkOr8fNoCc73E5nU5bqITjOYFMDrKHYOXWXrgD/SBaC7LjwuPymUprRyuF06UK7hd/lMHkmUXglQ==",
+          "requires": {
+            "bluebird": "^3.5.0",
+            "request-promise-core": "1.1.4",
+            "stealthy-require": "^1.1.1",
+            "tough-cookie": "^2.3.3"
+          }
+        },
+        "request-promise-core": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.4.tgz",
+          "integrity": "sha512-TTbAfBBRdWD7aNNOoVOBH4pN/KigV6LyapYNNlAPA8JwbovRti1E88m3sYAwsLi5ryhPKsE9APwnjFTgdUjTpw==",
+          "requires": {
+            "lodash": "^4.17.19"
+          },
+          "dependencies": {
+            "lodash": {
+              "version": "4.17.20",
+              "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.20.tgz",
+              "integrity": "sha512-PlhdFcillOINfeV7Ni6oF1TAEayyZBoZ8bcshTHqOYJYlrqzRK5hagpagky5o4HfCzzd1TRkXPMFq6cKk9rGmA=="
+            }
+          }
+        }
+      }
+    },
     "oauth-sign": {
       "version": "0.9.0",
       "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz",
"https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", @@ -1687,8 +1744,8 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#2ce5aebff836e1dcc48459ae7b930b8661c979ec", - "from": "git://github.com/TheBeastLT/parse-torrent-title.git#2ce5aebff836e1dcc48459ae7b930b8661c979ec", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#ed1772a7ad0014101d79dca7fa01269eb041a0b6", + "from": "git://github.com/TheBeastLT/parse-torrent-title.git#ed1772a7ad0014101d79dca7fa01269eb041a0b6", "requires": { "moment": "^2.24.0" } diff --git a/package.json b/package.json index 29ed486..87211b4 100644 --- a/package.json +++ b/package.json @@ -30,8 +30,9 @@ "node-gzip": "^1.1.2", "node-schedule": "^1.3.2", "nodejs-bing": "^0.1.0", + "nyaapi": "^2.3.3", "parse-torrent": "^6.1.2", - "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#2ce5aebff836e1dcc48459ae7b930b8661c979ec", + "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#ed1772a7ad0014101d79dca7fa01269eb041a0b6", "pg": "^7.8.2", "pg-hstore": "^2.3.2", "real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#935a5c23ae809edbcd2a111526a7f74d6767c50d", diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js index 88a1127..f9ee295 100644 --- a/scraper/lib/metadata.js +++ b/scraper/lib/metadata.js @@ -5,7 +5,7 @@ const bing = require('nodejs-bing'); const he = require('he'); const { cacheWrapImdbId, cacheWrapKitsuId, cacheWrapMetadata } = require('./cache'); const { Type } = require('./types'); -const { getRandomUserAgent } = require('./request_helper'); +const { getRandomUserAgent } = require('./requestHelper'); const CINEMETA_URL = 'https://v3-cinemeta.strem.io'; const KITSU_URL = 'https://anime-kitsu.strem.fun'; @@ -18,24 +18,23 @@ function getMetadata(id, type = Type.SERIES) { const key = Number.isInteger(id) || id.match(/^\d+$/) ? `kitsu:${id}` : id; const metaType = type === Type.MOVIE ? Type.MOVIE : Type.SERIES; - return cacheWrapMetadata(key, - () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`) - .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`)) - .catch(() => { - // try different type in case there was a mismatch - const otherType = metaType === Type.MOVIE ? Type.SERIES : Type.MOVIE; - return _requestMetadata(`${CINEMETA_URL}/meta/${otherType}/${key}.json`) - }) - .catch((error) => { - throw new Error(`failed metadata query ${key} due: ${error.message}`); - })); + return cacheWrapMetadata(key, () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`) + .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`)) + .catch(() => { + // try different type in case there was a mismatch + const otherType = metaType === Type.MOVIE ? 
diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js
index 88a1127..f9ee295 100644
--- a/scraper/lib/metadata.js
+++ b/scraper/lib/metadata.js
@@ -5,7 +5,7 @@ const bing = require('nodejs-bing');
 const he = require('he');
 const { cacheWrapImdbId, cacheWrapKitsuId, cacheWrapMetadata } = require('./cache');
 const { Type } = require('./types');
-const { getRandomUserAgent } = require('./request_helper');
+const { getRandomUserAgent } = require('./requestHelper');
 
 const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
 const KITSU_URL = 'https://anime-kitsu.strem.fun';
@@ -18,24 +18,23 @@ function getMetadata(id, type = Type.SERIES) {
   const key = Number.isInteger(id) || id.match(/^\d+$/) ? `kitsu:${id}` : id;
   const metaType = type === Type.MOVIE ? Type.MOVIE : Type.SERIES;
 
-  return cacheWrapMetadata(key,
-      () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`)
-          .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`))
-          .catch(() => {
-            // try different type in case there was a mismatch
-            const otherType = metaType === Type.MOVIE ? Type.SERIES : Type.MOVIE;
-            return _requestMetadata(`${CINEMETA_URL}/meta/${otherType}/${key}.json`)
-          })
-          .catch((error) => {
-            throw new Error(`failed metadata query ${key} due: ${error.message}`);
-          }));
+  return cacheWrapMetadata(key, () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`)
+      .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`))
+      .catch(() => {
+        // try different type in case there was a mismatch
+        const otherType = metaType === Type.MOVIE ? Type.SERIES : Type.MOVIE;
+        return _requestMetadata(`${CINEMETA_URL}/meta/${otherType}/${key}.json`)
+      })
+      .catch((error) => {
+        throw new Error(`failed metadata query ${key} due: ${error.message}`);
+      }));
 }
 
 function _requestMetadata(url) {
   return needle('get', url, { open_timeout: TIMEOUT })
       .then((response) => {
         const body = response.body;
-        if (body && body.meta && body.meta.id) {
+        if (body && body.meta && (body.meta.imdb_id || body.meta.kitsu_id)) {
           return {
             kitsuId: body.meta.kitsu_id,
             imdbId: body.meta.imdb_id,
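The reworked lookup tries three sources in order before giving up, and the tightened body check now requires an actual imdb or kitsu id rather than any meta.id. The same fallback order as a condensed, self-contained sketch (fetchJson stands in for _requestMetadata; the TIMEOUT value is assumed, since the real constant is defined outside this hunk):

    const needle = require('needle');

    const TIMEOUT = 20000; // assumed value; the module defines its own TIMEOUT

    // stand-in for _requestMetadata: only accept metas carrying a usable id
    const fetchJson = url => needle('get', url, { open_timeout: TIMEOUT })
        .then(response => {
          const body = response.body;
          if (body && body.meta && (body.meta.imdb_id || body.meta.kitsu_id)) {
            return body.meta;
          }
          throw new Error('no valid metadata');
        });

    async function getMetadataSketch(key, metaType) {
      const otherType = metaType === 'movie' ? 'series' : 'movie';
      const urls = [
        `https://anime-kitsu.strem.fun/meta/${metaType}/${key}.json`,  // kitsu addon first
        `https://v3-cinemeta.strem.io/meta/${metaType}/${key}.json`,   // then cinemeta, same type
        `https://v3-cinemeta.strem.io/meta/${otherType}/${key}.json`   // then cinemeta, flipped type
      ];
      for (const url of urls) {
        const meta = await fetchJson(url).catch(() => undefined);
        if (meta) return meta;
      }
      throw new Error(`failed metadata query ${key}`);
    }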
diff --git a/scraper/lib/parseHelper.js b/scraper/lib/parseHelper.js
new file mode 100644
index 0000000..eb79dde
--- /dev/null
+++ b/scraper/lib/parseHelper.js
@@ -0,0 +1,65 @@
+const { parse } = require('parse-torrent-title');
+const { Type } = require('./types');
+
+function parseSeriesVideos(torrent, videos) {
+  const parsedTorrentName = parse(torrent.title);
+  const hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/i);
+  const parsedVideos = videos.map(video => parseSeriesVideo(video, parsedTorrentName));
+  return parsedVideos.map(video => ({ ...video, isMovie: isMovieVideo(video, parsedVideos, torrent.type, hasMovies) }));
+}
+
+function parseSeriesVideo(video, parsedTorrentName) {
+  const videoInfo = parse(video.name);
+  // the episode may be in a folder containing season number
+  if (!videoInfo.season && video.path.includes('/')) {
+    const folders = video.path.split('/');
+    const pathInfo = parse(folders[folders.length - 2]);
+    videoInfo.season = pathInfo.season;
+  }
+  if (!videoInfo.season && parsedTorrentName.season) {
+    videoInfo.season = parsedTorrentName.season;
+  }
+  if (!videoInfo.season && videoInfo.seasons && videoInfo.seasons.length > 1) {
+    // in case single file was interpreted as having multiple seasons
+    videoInfo.season = videoInfo.seasons[0];
+  }
+  // sometimes video file does not have correct date format as in torrent title
+  if (!videoInfo.episodes && !videoInfo.date && parsedTorrentName.date) {
+    videoInfo.date = parsedTorrentName.date;
+  }
+  // force episode to any found number if it was not parsed
+  if (!videoInfo.episodes && !videoInfo.date) {
+    const epMatcher = videoInfo.title.match(
+        /(?
3
+      && otherVideos.filter(other => other.title === video.title && other.year === video.year).length < 3;
+}
+
+module.exports = { parseSeriesVideos };
\ No newline at end of file
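parseSeriesVideos is the extracted replacement for the removed parseSeriesFile mapping in torrentFiles.js below: it parses every video name, falls back to folder, torrent-title and date hints for the season, and flags per-video movies via isMovieVideo. A quick sketch of the expected call shape (the torrent and file values are made up, and the bare 'series' literal stands in for Type.SERIES):

    const { parseSeriesVideos } = require('./scraper/lib/parseHelper');

    const torrent = { title: 'Some Show Season 2 Complete', type: 'series' };
    const videos = [
      // the season is expected to come from the containing folder when the file name lacks it
      { name: 'Episode 01.mkv', path: 'Season 2/Episode 01.mkv', size: 350000000 },
      { name: 'Episode 02.mkv', path: 'Season 2/Episode 02.mkv', size: 355000000 }
    ];

    // each entry comes back with parsed season/episodes plus an isMovie flag
    parseSeriesVideos(torrent, videos)
        .forEach(video => console.log(video.season, video.episodes, video.isMovie));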
diff --git a/scraper/lib/request_helper.js b/scraper/lib/requestHelper.js
similarity index 100%
rename from scraper/lib/request_helper.js
rename to scraper/lib/requestHelper.js
diff --git a/scraper/lib/torrentFiles.js b/scraper/lib/torrentFiles.js
index b4f9316..44b2561 100644
--- a/scraper/lib/torrentFiles.js
+++ b/scraper/lib/torrentFiles.js
@@ -4,6 +4,7 @@ const { parse } = require('parse-torrent-title');
 const Promises = require('../lib/promises');
 const { torrentFiles } = require('../lib/torrent');
 const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata');
+const { parseSeriesVideos } = require('../lib/parseHelper');
 const { Type } = require('./types');
 const { isDisk } = require('./extension');
 
@@ -12,8 +13,8 @@ const MULTIPLE_FILES_SIZE = 4 * 1024 * 1024 * 1024; // 4 GB
 
 async function parseTorrentFiles(torrent) {
   const parsedTorrentName = parse(torrent.title);
-  parsedTorrentName.hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/i);
   const metadata = await getMetadata(torrent.kitsuId || torrent.imdbId, torrent.type || Type.MOVIE)
+      .then(meta => Object.assign({}, meta))
       .catch(() => undefined);
 
   // if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
@@ -66,9 +67,8 @@ async function parseMovieFiles(torrent, parsedName, metadata) {
 
 async function parseSeriesFiles(torrent, parsedName, metadata) {
   const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent, parsedName);
   const parsedVideos = await Promise.resolve(videos)
-      .then(videos => videos
-          .filter(video => video.size > MIN_SIZE)
-          .map(video => parseSeriesFile(video, parsedName, torrent.type)))
+      .then(videos => videos.filter(video => videos.length === 1 || video.size > MIN_SIZE))
+      .then(videos => parseSeriesVideos(torrent, videos))
       .then(videos => decomposeEpisodes(torrent, videos, metadata))
       .then(videos => assignKitsuOrImdbEpisodes(torrent, videos, metadata))
       .then(videos => Promise.all(videos.map(video => video.isMovie
@@ -96,7 +96,7 @@ async function getMoviesTorrentContent(torrent, parsedName) {
 }
 
 async function getSeriesTorrentContent(torrent, parsedName) {
-  const hasMultipleEpisodes = parsedName.complete || parsedName.hasMovies || torrent.size > MULTIPLE_FILES_SIZE ||
+  const hasMultipleEpisodes = parsedName.complete || torrent.size > MULTIPLE_FILES_SIZE ||
       (parsedName.seasons && parsedName.seasons.length > 1);
   const hasSingleEpisode = Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date);
   return torrentFiles(torrent)
@@ -138,7 +138,7 @@ async function mapSeriesEpisode(file, torrent, files) {
 
 async function mapSeriesMovie(file, torrent) {
   const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined;
   const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined;
-  const metadata = getMetadata(imdbId, Type.MOVIE).catch(() => undefined);
+  const metadata = await getMetadata(kitsuId || imdbId, Type.MOVIE).catch(() => undefined);
   return [{
     infoHash: torrent.infoHash,
     fileIndex: file.fileIndex,
@@ -149,39 +149,6 @@ async function mapSeriesMovie(file, torrent) {
   }];
 }
 
-function parseSeriesFile(file, parsedTorrentName, type) {
-  const fileInfo = parse(file.name);
-  // the episode may be in a folder containing season number
-  if (!fileInfo.season && file.path.includes('/')) {
-    const folders = file.path.split('/');
-    const pathInfo = parse(folders[folders.length - 2]);
-    fileInfo.season = pathInfo.season;
-  }
-  if (!fileInfo.season && parsedTorrentName.season) {
-    fileInfo.season = parsedTorrentName.season;
-  }
-  if (!fileInfo.season && fileInfo.seasons && fileInfo.seasons.length > 1) {
-    // in case single file was interpreted as having multiple seasons
-    fileInfo.season = fileInfo.seasons[0];
-  }
-  // sometimes video file does not have correct date format as in torrent title
-  if (!fileInfo.episodes && !fileInfo.date && parsedTorrentName.date) {
-    fileInfo.date = parsedTorrentName.date;
-  }
-  // force episode to any found number if it was not parsed
-  if (!fileInfo.episodes && !fileInfo.date) {
-    const epMatcher = fileInfo.title.match(
-        /(?
 !file.episodes && !file.date)) {
     return files;
@@ -413,7 +380,8 @@ async function updateToCinemetaMetadata(metadata) {
         metadata.episodeCount = newMetadata.episodeCount;
         metadata.totalCount = newMetadata.totalCount;
         return metadata;
-      });
+      })
+      .catch(error => console.warn(`Failed ${metadata.imdbId} metadata cinemeta update due: ${error.message}`));
 }
 
 function findMovieImdbId(title) {
diff --git a/scraper/manual/manual.js b/scraper/manual/manual.js
index 354c4d1..53c76b9 100644
--- a/scraper/manual/manual.js
+++ b/scraper/manual/manual.js
@@ -1,6 +1,6 @@
 const Bottleneck = require('bottleneck');
 const { parse } = require('parse-torrent-title');
-const Promises = require('../lib/promises');
+const { mostCommonValue } = require('../lib/promises');
 const repository = require('../lib/repository');
 const { getImdbId } = require('../lib/metadata');
 const { parseTorrentFiles } = require('../lib/torrentFiles');
@@ -67,7 +67,7 @@ async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
         path: file.title,
         size: file.size
       }));
-  const imdbId = storedFiles.length && storedFiles[0].imdbId || await getImdbId(parse(torrent.title));
+  const imdbId = mostCommonValue(storedFiles.map(file => file.imdbId)) || await getImdbId(parse(torrent.title));
 
   return parseTorrentFiles({ ...torrent.get(), imdbId, files })
       .then(torrentContents => torrentContents.videos)
@@ -113,7 +113,7 @@ async function assignSubs() {
 }
 
 async function openTorrentContents() {
-  const limiter = new Bottleneck({ maxConcurrent: 5 });
+  const limiter = new Bottleneck({ maxConcurrent: 15 });
   const unopenedTorrents = await repository.getNoContentsTorrents();
 
   return Promise.all(unopenedTorrents.map(torrent => limiter.schedule(() => createTorrentContents(torrent))))
@@ -195,9 +195,9 @@ async function findAllFiles() {
 
 //findAllFiles().then(() => console.log('Finished'));
 //updateMovieCollections().then(() => console.log('Finished'));
-// reapplyEpisodeDecomposing('0b6c0f0692bdb151efb87e3de90e46e3b177444e', false).then(() => console.log('Finished'));
+reapplyEpisodeDecomposing('3598d561d632c7a6be23fd9245f7323f89ca0ee8', false).then(() => console.log('Finished'));
 //reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
 //reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished'));
 //reapplyManualHashes().then(() => console.log('Finished'));
 // assignSubs().then(() => console.log('Finished'));
-openTorrentContents().then(() => console.log('Finished'));
\ No newline at end of file
+// openTorrentContents().then(() => console.log('Finished'));
\ No newline at end of file
diff --git a/scraper/scheduler/scrapers.js b/scraper/scheduler/scrapers.js
index 7bbcb4d..39e6147 100644
--- a/scraper/scheduler/scrapers.js
+++ b/scraper/scheduler/scrapers.js
@@ -5,17 +5,19 @@ const eztvScraper = require('../scrapers/eztv/eztv_scraper');
 const leetxScraper = require('../scrapers/1337x/1337x_scraper');
 const kickassScraper = require('../scrapers/kickass/kickass_scraper');
 const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
+const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
 
 module.exports = [
   { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: eztvScraper, name: eztvScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: horribleSubsScraper, name: horribleSubsScraper.NAME, cron: '0 0 */4 ? * *' },
+  { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */2 ? * *' },
   { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
   { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
   // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
-  // { scraper: require('../scrapers/1337x/1337x_dump_scraper') }
+  // { scraper: require('../scrapers/1337x/1337x_search_scraper') }
   // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
   // { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
   // { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
diff --git a/scraper/scrapers/1337x/1337x_api.js b/scraper/scrapers/1337x/1337x_api.js
index f6d0626..52bad3a 100644
--- a/scraper/scrapers/1337x/1337x_api.js
+++ b/scraper/scrapers/1337x/1337x_api.js
@@ -4,7 +4,7 @@ const Sugar = require('sugar-date');
 const decode = require('magnet-uri');
 const Promises = require('../../lib/promises');
 const { escapeHTML } = require('../../lib/metadata');
-const { getRandomUserAgent, defaultOptionsWithProxy } = require('../../lib/request_helper');
+const { getRandomUserAgent, defaultOptionsWithProxy } = require('../../lib/requestHelper');
 
 const defaultProxies = [
   'https://1337x.to'
diff --git a/scraper/scrapers/eztv/eztv_api.js b/scraper/scrapers/eztv/eztv_api.js
index b33e8bb..256a4d5 100644
--- a/scraper/scrapers/eztv/eztv_api.js
+++ b/scraper/scrapers/eztv/eztv_api.js
@@ -2,7 +2,7 @@ const cheerio = require('cheerio');
 const needle = require('needle');
 const moment = require('moment');
 const Promises = require('../../lib/promises');
-const { getRandomUserAgent } = require('./../../lib/request_helper');
+const { getRandomUserAgent } = require('./../../lib/requestHelper');
 
 const defaultProxies = [
   'https://eztv.io'
diff --git a/scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js b/scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js
new file mode 100644
index 0000000..a7de428
--- /dev/null
+++ b/scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js
@@ -0,0 +1,63 @@
+const { pantsu } = require('nyaapi');
+
+const Categories = {
+  ANIME: {
+    ALL: '3_',
+    ENGLISH: '3_5',
+    RAW: '3_6',
+    MUSIC_VIDEO: '3_12',
+    NON_ENGLISH: '3_13',
+  },
+  LIVE_ACTION: {
+    ALL: '5_',
+    ENGLISH: '5_9',
+    RAW: '5_11',
+    PROMOTIONAL_VIDEO: '5_10',
+    NON_ENGLISH: '5_18',
+  }
+};
+
+function torrent(torrentId) {
+  if (!torrentId) {
+    return Promise.reject(new Error(`Failed ${torrentId} search`));
+  }
+
+  return pantsu.infoRequest(torrentId)
+      .then(result => parseTorrent(result));
+}
+
+function search(query) {
+  return pantsu.search(query)
+      .then(results => results.map(torrent => parseTorrent(torrent)));
+}
+
+function browse(config = {}) {
+  const page = config.page || 1;
+  const category = config.category || Categories.ANIME.ENGLISH;
+
+  return pantsu.list(category, page)
+      .then(results => results.map(torrent => parseTorrent(torrent)));
+}
+
+function parseTorrent(torrent) {
+  return {
+    title: torrent.name.replace(/\t|\s+/g, ' ').trim(),
+    torrentId: torrent.id,
+    infoHash: torrent.hash.trim().toLowerCase(),
+    magnetLink: torrent.magnet,
+    torrentLink: torrent.torrent,
+    seeders: torrent.seeders,
+    size: torrent.filesize,
+    uploadDate: new Date(torrent.date),
+    category: `${torrent.category}_${torrent.sub_category}`,
+    languages: torrent.languages ? torrent.languages.join(',') : undefined,
+    files: torrent.file_list && torrent.file_list.length ? torrent.file_list.map((file, fileId) => ({
+      fileIndex: fileId,
+      // strip folder prefixes so name holds just the file name
+      name: file.path.replace(/^.*\//, ''),
+      path: file.path,
+      size: file.filesize
+    })) : undefined
+  };
+}
+
+module.exports = { torrent, search, browse, Categories };
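For trying the wrapper out in isolation, a minimal usage sketch of its exports (the id '1033095' is taken from the commented-out debug block in the scraper below; the printed fields are the normalized ones produced by parseTorrent):

    const pantsuApi = require('./scraper/scrapers/nyaapantsu/nyaa_pantsu_api');

    // list the first page of English-translated anime
    pantsuApi.browse({ category: pantsuApi.Categories.ANIME.ENGLISH, page: 1 })
        .then(torrents => torrents.forEach(torrent =>
            console.log(`${torrent.infoHash} [${torrent.seeders}] ${torrent.title}`)));

    // fetch a single entry, including its file list when available
    pantsuApi.torrent('1033095')
        .then(torrent => console.log(torrent.files));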
diff --git a/scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js b/scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js
new file mode 100644
index 0000000..8fa5281
--- /dev/null
+++ b/scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js
@@ -0,0 +1,96 @@
+const moment = require('moment');
+const Bottleneck = require('bottleneck');
+const pantsu = require('./nyaa_pantsu_api');
+const { Type } = require('../../lib/types');
+const Promises = require('../../lib/promises');
+const repository = require('../../lib/repository');
+const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
+const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
+
+const NAME = 'NyaaPantsu';
+const UNTIL_PAGE = 5;
+
+const limiter = new Bottleneck({ maxConcurrent: 5 });
+
+async function scrape() {
+  const scrapeStart = moment();
+  const lastScrape = await repository.getProvider({ name: NAME });
+  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
+
+  // const ids = ['1033095'];
+  // return Promise.all(ids.map(id => limiter.schedule(() => pantsu.torrent(id)
+  //     .then(torrent => processTorrentRecord(torrent)))))
+  //     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+  return scrapeLatestTorrents()
+      .then(() => {
+        lastScrape.lastScraped = scrapeStart;
+        return lastScrape.save();
+      })
+      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
+}
+
+async function updateSeeders(torrent) {
+  return limiter.schedule(() => pantsu.torrent(torrent.torrentId))
+      .then(foundTorrent => {
+        if (Number.isInteger(foundTorrent.seeders)) {
+          return [foundTorrent];
+        }
+        return [];
+      });
+}
+
+async function scrapeLatestTorrents() {
+  const allowedCategories = [
+    pantsu.Categories.ANIME.ENGLISH
+  ];
+
+  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
+      .then(entries => entries.reduce((a, b) => a.concat(b), []));
+}
+
+async function scrapeLatestTorrentsForCategory(category, page = 1) {
+  console.log(`Scraping ${NAME} ${category} category page ${page}`);
+  return pantsu.browse({ category, page })
+      .catch(error => {
+        console.warn(`Failed ${NAME} scraping for [${page}] ${category} due: `, error);
+        return Promise.resolve([]);
+      })
+      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)
+          .catch(error => {
+            console.warn(`Failed processing [${torrent.infoHash}] ${torrent.title} due: `, error);
+            return Promise.resolve();
+          })))))
+      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
+          ? scrapeLatestTorrentsForCategory(category, page + 1)
+          : Promise.resolve());
+}
+
+async function processTorrentRecord(record) {
+  if (!record || await checkAndUpdateTorrent(record)) {
+    return record;
+  }
+
+  if (!record.size) {
+    await updateTorrentSize(record);
+  }
+  if (record.seeders === null || record.seeders === undefined) {
+    await updateCurrentSeeders(record);
+  }
+
+  const torrent = {
+    infoHash: record.infoHash,
+    provider: NAME,
+    torrentId: record.torrentId,
+    title: record.title,
+    type: Type.ANIME,
+    size: record.size,
+    seeders: record.seeders,
+    uploadDate: record.uploadDate,
+    languages: record.languages,
+    files: record.files || undefined
+  };
+
+  return createTorrentEntry(torrent).then(() => torrent);
+}
+
+module.exports = { scrape, updateSeeders, NAME };
\ No newline at end of file
diff --git a/scraper/scrapers/yts/yts_api.js b/scraper/scrapers/yts/yts_api.js
index 9601ef0..0e71f61 100644
--- a/scraper/scrapers/yts/yts_api.js
+++ b/scraper/scrapers/yts/yts_api.js
@@ -1,6 +1,6 @@
 const needle = require('needle');
 const Promises = require('../../lib/promises');
-const { getRandomUserAgent } = require('./../../lib/request_helper');
+const { getRandomUserAgent } = require('./../../lib/requestHelper');
 
 const defaultProxies = [
   'https://yts.mx'
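The page walk in scrapeLatestTorrentsForCategory follows the same pattern as the other scrapers: fetch a page, push every record through the shared limiter, and recurse until a page yields nothing or UNTIL_PAGE is reached. The same control flow in isolation (fetchPage and processRecord are stand-ins for pantsu.browse and processTorrentRecord):

    const Bottleneck = require('bottleneck');

    const UNTIL_PAGE = 5;
    const limiter = new Bottleneck({ maxConcurrent: 5 });

    async function scrapePages(fetchPage, processRecord, page = 1) {
      // a failed page fetch ends the walk instead of crashing the whole scrape
      const records = await fetchPage(page).catch(() => []);
      // every record goes through the shared limiter so lookups don't flood the site
      const processed = await Promise.all(
          records.map(record => limiter.schedule(() => processRecord(record))));
      // continue only while pages keep returning results and the page cap is not hit
      return processed.length > 0 && page < UNTIL_PAGE
          ? scrapePages(fetchPage, processRecord, page + 1)
          : Promise.resolve();
    }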