diff --git a/index.js b/index.js index 2fe3842..f701793 100644 --- a/index.js +++ b/index.js @@ -5,7 +5,7 @@ const { connect } = require('./lib/repository'); const tpbDump = require('./scrapers/piratebay_dump'); const horribleSubsScraper = require('./scrapers/horiblesubs_scraper'); -const providers = [horribleSubsScraper]; +const providers = [tpbDump]; async function scrape() { providers.forEach((provider) => provider.scrape()); diff --git a/lib/cache.js b/lib/cache.js new file mode 100644 index 0000000..23ad848 --- /dev/null +++ b/lib/cache.js @@ -0,0 +1,32 @@ +const cacheManager = require('cache-manager'); + +const GLOBAL_KEY_PREFIX = 'stremio-torrentio'; +const IMDB_ID_PREFIX = `${GLOBAL_KEY_PREFIX}|imdb_id`; +const METADATA_PREFIX = `${GLOBAL_KEY_PREFIX}|metadata`; + +const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days + + +const cache = initiateCache(); + +function initiateCache() { + return cacheManager.caching({ + store: 'memory', + ttl: GLOBAL_TTL + }); +} + +function cacheWrap(key, method, options) { + return cache.wrap(key, method, options); +} + +function cacheWrapImdbId(key, method) { + return cacheWrap(`${IMDB_ID_PREFIX}:${key}`, method, { ttl: GLOBAL_TTL }); +} + +function cacheWrapMetadata(id, method) { + return cacheWrap(`${METADATA_PREFIX}:${id}`, method, { ttl: GLOBAL_TTL }); +} + +module.exports = { cacheWrapImdbId, cacheWrapMetadata }; + diff --git a/lib/metadata.js b/lib/metadata.js index 77721ab..b6a0591 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -1,32 +1,38 @@ -const _ = require('lodash'); const needle = require('needle'); const nameToImdb = require('name-to-imdb'); +const bing = require('nodejs-bing'); +const { cacheWrapImdbId, cacheWrapMetadata } = require('./cache'); const CINEMETA_URL = 'https://v3-cinemeta.strem.io'; function getMetadata(imdbId, type) { - return needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 1000 }) - .then((response) => response.body) - .then((body) => { - if (body && body.meta && body.meta.name) { - return { - imdbId: imdbId, - title: body.meta.name, - year: body.meta.year, - genres: body.meta.genres, - episodeCount: body.meta.videos && _.chain(body.meta.videos) - .countBy('season') - .toPairs() - .filter((pair) => pair[0] !== '0') - .sortBy((pair) => parseInt(pair[0], 10)) - .map((pair) => pair[1]) - .value() - }; - } else { - console.log(`failed cinemeta query: Empty Body`); - throw new Error('failed cinemeta query'); - } - }); + return cacheWrapMetadata(imdbId, + () => needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 60000 }) + .then((response) => { + const body = response.body; + if (body && body.meta && body.meta.name) { + return { + imdbId: imdbId, + title: body.meta.name, + year: body.meta.year, + genres: body.meta.genres, + totalEpisodes: body.meta.videos && body.meta.videos + .filter(video => video.season > 0).length, + episodeCount: body.meta.videos && Object.values(body.meta.videos + .filter((entry) => entry.season !== 0) + .sort((a, b) => a.season - b.season) + .reduce((map, next) => { + map[next.season] = map[next.season] + 1 || 1; + return map; + }, {})) + }; + } else { + throw new Error('No search results'); + } + }) + .catch((error) => { + throw new Error(`failed cinemeta query ${imdbId} due: ${error.message}`); + })); } function escapeTitle(title, hyphenEscape = true) { @@ -39,48 +45,23 @@ function escapeTitle(title, hyphenEscape = true) { .trim(); } -async function seriesMetadata(id) { - const idInfo = id.split(':'); - const imdbId = idInfo[0]; - const season = parseInt(idInfo[1], 10); - const episode = parseInt(idInfo[2], 10); - - const metadata = await getMetadata(imdbId, 'series'); - const title = escapeTitle(metadata.title); - - return { - imdb: imdbId, - title: hardcodedTitles[imdbId] || title, - season: season, - episode: episode, - absoluteEpisode: hasEpisodeCount && metadata.episodeCount.slice(0, season - 1).reduce((a, b) => a + b, episode), - genres: metadata.genres, - isAnime: !metadata.genres.length || metadata.genres.includes('Animation') - }; -} - -async function movieMetadata(id) { - const metadata = await getMetadata(id, 'movie'); - - return { - imdb: id, - title: escapeTitle(metadata.title), - year: metadata.year, - genres: metadata.genres, - isAnime: !metadata.genres.length || metadata.genres.includes('Animation') - }; -} - async function getImdbId(info) { - return new Promise((resolve, reject) => { - nameToImdb(info, function(err, res) { - if (res) { - resolve(res); - } else { - reject(err || new Error('failed imdbId search')); - } - }); - }); + const key = `${info.name}_${info.year}_${info.type}`; + + return cacheWrapImdbId(key, + () => new Promise((resolve, reject) => { + nameToImdb(info, function(err, res) { + if (res) { + resolve(res); + } else { + reject(err || new Error('failed imdbId search')); + } + }); + }).catch(() => bing.web(`${info.name} ${info.year || ''} ${info.type} imdb`) + .then((results) => results + .map((result) => result.link) + .find(result => result.includes('imdb.com/title/')) + .match(/imdb\.com\/title\/(tt\d+)/)[1]))); } -module.exports = { escapeTitle, getMetadata, movieMetadata, seriesMetadata, getImdbId }; +module.exports = { escapeTitle, getMetadata, getImdbId }; diff --git a/lib/repository.js b/lib/repository.js index d10dd4c..a2217d7 100644 --- a/lib/repository.js +++ b/lib/repository.js @@ -1,4 +1,5 @@ const { Sequelize }= require('sequelize'); +const Op = Sequelize.Op; const POSTGRES_URI = process.env.POSTGRES_URI || 'postgres://torrentio:postgres@localhost:5432/torrentio'; @@ -13,20 +14,42 @@ const Torrent = database.define('torrent', { infoHash: { type: Sequelize.STRING(64), primaryKey: true }, provider: { type: Sequelize.STRING(32), allowNull: false }, title: { type: Sequelize.STRING(128), allowNull: false }, + size: { type: Sequelize.BIGINT }, type: { type: Sequelize.STRING(16), allowNull: false }, - imdbId: { type: Sequelize.STRING(12) }, - kitsuId: { type: Sequelize.INTEGER }, uploadDate: { type: Sequelize.DATE, allowNull: false }, - seeders: { type: Sequelize.SMALLINT }, - files: { type: Sequelize.JSONB } + seeders: { type: Sequelize.SMALLINT } }); +const File = database.define('file', + { + id: { type: Sequelize.BIGINT, autoIncrement: true, primaryKey: true }, + infoHash: { type: Sequelize.STRING(64), allowNull: false, references: { model: Torrent, key: 'infoHash' }, onDelete: 'CASCADE' }, + fileIndex: { type: Sequelize.INTEGER }, + title: { type: Sequelize.STRING(128), allowNull: false }, + size: { type: Sequelize.BIGINT }, + imdbId: { type: Sequelize.STRING(12) }, + imdbSeason: { type: Sequelize.INTEGER }, + imdbEpisode: { type: Sequelize.INTEGER }, + kitsuId: { type: Sequelize.INTEGER }, + kitsuEpisode: { type: Sequelize.INTEGER } + }, + { + indexes:[ + { unique: true, fields:['infoHash'], where: { fileIndex: { [Op.eq]: null } } }, + { unique: true, fields:['infoHash', 'fileIndex', 'imdbEpisode'] }, + { unique: false, fields:['imdbId', 'imdbSeason', 'imdbEpisode'] }, + { unique: false, fields:['kitsuId', 'kitsuEpisode'] } + ] + } +); + const SkipTorrent = database.define('skip_torrent', { infoHash: {type: Sequelize.STRING(64), primaryKey: true}, }); const FailedImdbTorrent = database.define('failed_imdb_torrent', { infoHash: {type: Sequelize.STRING(64), primaryKey: true}, + title: { type: Sequelize.STRING(128), allowNull: false } }); function connect() { @@ -34,7 +57,7 @@ function connect() { } function getProvider(provider) { - return Provider.findOrCreate({ where: { name: provider.name }, defaults: provider }); + return Provider.findOrCreate({ where: { name: { [Op.eq]: provider.name }}, defaults: provider }); } function updateProvider(provider) { @@ -51,10 +74,14 @@ function getTorrent(torrent) { }) } -function updateTorrent(torrent) { +function createTorrent(torrent) { return Torrent.upsert(torrent); } +function createFile(file) { + return File.upsert(file); +} + function getSkipTorrent(torrent) { return SkipTorrent.findByPk(torrent.infoHash) .then((result) =>{ @@ -80,7 +107,7 @@ function getFailedImdbTorrent(torrent) { } function createFailedImdbTorrent(torrent) { - return FailedImdbTorrent.upsert({ infoHash: torrent.infoHash }); + return FailedImdbTorrent.upsert(torrent); } -module.exports = { connect, getProvider, updateProvider, getTorrent, updateTorrent, getSkipTorrent, createSkipTorrent, createFailedImdbTorrent }; \ No newline at end of file +module.exports = { connect, getProvider, updateProvider, getTorrent, createTorrent, createFile, getSkipTorrent, createSkipTorrent, createFailedImdbTorrent }; \ No newline at end of file diff --git a/lib/torrent.js b/lib/torrent.js index c1c3567..03af28a 100644 --- a/lib/torrent.js +++ b/lib/torrent.js @@ -46,7 +46,7 @@ function filesFromKat(infoHash) { if (!infoHash) { return Promise.reject(new Error("no infoHash")); } - const url = `http://kat.rip/torrent/${infoHash}.html`; + const url = `https://kat.rip/torrent/${infoHash}.html`; return needle('get', url, { open_timeout: 2000 }) .then((response) => { if (!response.body || response.statusCode !== 200) { @@ -61,6 +61,7 @@ function filesFromKat(infoHash) { $('table[id=\'ul_top\'] tr').each((index, row) => { files.push({ fileIndex: index, + name: $(row).find('td[class=\'torFileName\']').text().replace(/.*\//, ''), path: $(row).find('td[class=\'torFileName\']').text(), size: convertToBytes($(row).find('td[class=\'torFileSize\']').text()) }); @@ -139,24 +140,22 @@ function convertToBytes(sizeString) { function dynamicTimeout(torrent) { if (torrent.seeders < 5) { - return 3000; - } else if (torrent.seeders < 10) { - return 4000; - } else if (torrent.seeders < 20) { return 5000; - } else if (torrent.seeders < 30) { + } else if (torrent.seeders < 10) { return 7000; - } else if (torrent.seeders < 50) { - return 9000; - } else if (torrent.seeders < 100) { - return 12000; - } else { + } else if (torrent.seeders < 20) { + return 10000; + } else if (torrent.seeders < 30) { return 15000; + } else if (torrent.seeders < 50) { + return 20000; + } else { + return 30000; } } function getTrackerList() { - return needle('get', 'https://torrents.me/tracker-list/', { open_timeout: 2000 }) + return needle('get', 'https://torrents.me/tracker-list/', { open_timeout: 2000, follow_max: 2 }) .then((response) => { if (!response.body || response.statusCode !== 200) { throw new Error('tracker list not found') diff --git a/lib/torrentFiles.js b/lib/torrentFiles.js new file mode 100644 index 0000000..7318664 --- /dev/null +++ b/lib/torrentFiles.js @@ -0,0 +1,82 @@ +const { torrentFiles } = require('../lib/torrent'); +const { getMetadata } = require('../lib/metadata'); +const { parse } = require('parse-torrent-title'); +const { Type } = require('./types'); + +const MIN_SIZE = 20 * 1024 * 1024; // 20 MB + +async function parseTorrentFiles(torrent, imdbId) { + if (torrent.type === Type.MOVIE) { + return [{ + infoHash: torrent.infoHash, + title: torrent.title, + size: torrent.size, + imdbId: imdbId, + }]; + } + const parsedTorrentName = parse(torrent.title); + if (parsedTorrentName.season && parsedTorrentName.episode) { + return [{ + infoHash: torrent.infoHash, + title: torrent.title, + size: torrent.size, + imdbId: imdbId, + imdbSeason: parsedTorrentName.season, + imdbEpisode: parsedTorrentName.episode + }]; + } + + return torrentFiles(torrent) + .then(files => files + .filter(file => file.size > MIN_SIZE) + .map(file => parseFile(file, parsedTorrentName))) + .then(files => decomposeAbsoluteEpisodes(files, torrent, imdbId)) + .then(files => files + .filter(file => file.season && file.episodes && file.episodes.length) + .map(file => file.episodes.map(episode => ({ + infoHash: torrent.infoHash, + fileIndex: file.fileIndex, + title: file.name, + size: file.size, + imdbId: imdbId, + imdbSeason: file.season, + imdbEpisode: episode}))) + .reduce((a, b) => a.concat(b), [])) + .catch(error => { + console.log(`Failed getting files for ${torrent.title}`, error.message); + return []; + }); +} + +function parseFile(file, parsedTorrentName) { + const fileInfo = parse(file.name); + // the episode may be in a folder containing season number + if (!fileInfo.season && parsedTorrentName.season) { + fileInfo.season = parsedTorrentName.season; + } else if (!fileInfo.season && file.path.includes('/')) { + const folders = file.path.split('/'); + const pathInfo = parse(folders[folders.length - 2]); + fileInfo.season = pathInfo.season; + } + + return { ...file, ...fileInfo }; +} + +async function decomposeAbsoluteEpisodes(files, torrent, imdbId) { + if (files.every((file) => file.episodes.every((ep) => ep < 100))) { + return; // nothing to decompose + } + + const metadata = await getMetadata(imdbId, torrent.type || Type.MOVIE); + // decompose if season is inside path, but individual files are concatenated ex. 101 (S01E01) + files + .filter(file => file.season && metadata.episodeCount[file.season] < 100) + .filter(file => file.episodes.every(ep => ep / 100 === file.season)) + .forEach(file => file.episodes = file.episodes.map(ep => ep % 100)); + // decompose if no season info is available, but individual files are concatenated ex. 101 (S01E01) + // based on total episodes count per season + + return files; +} + +module.exports = { parseTorrentFiles }; \ No newline at end of file diff --git a/lib/types.js b/lib/types.js new file mode 100644 index 0000000..2f37a8e --- /dev/null +++ b/lib/types.js @@ -0,0 +1,5 @@ +exports.Type = { + MOVIE: 'movie', + SERIES: 'series', + ANIME: 'anime' +}; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index e682689..f2146a3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,6 +14,30 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-11.11.0.tgz", "integrity": "sha512-D5Rt+HXgEywr3RQJcGlZUCTCx1qVbCZpVk3/tOOA6spLNZdGm8BU+zRgdRYDoF1pO3RuXLxADzMrF903JlQXqg==" }, + "CSSselect": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/CSSselect/-/CSSselect-0.4.1.tgz", + "integrity": "sha1-+Kt+H4QYzmPNput713ioXX7EkrI=", + "requires": { + "CSSwhat": "0.4", + "domutils": "1.4" + }, + "dependencies": { + "domutils": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.4.3.tgz", + "integrity": "sha1-CGVRN5bGswYDGFDhdVFrr4C3Km8=", + "requires": { + "domelementtype": "1" + } + } + } + }, + "CSSwhat": { + "version": "0.4.7", + "resolved": "https://registry.npmjs.org/CSSwhat/-/CSSwhat-0.4.7.tgz", + "integrity": "sha1-hn2g/zn3eGEyQsRM/qg/CqTr35s=" + }, "accepts": { "version": "1.3.5", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", @@ -245,6 +269,15 @@ "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" }, + "boom": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/boom/-/boom-0.4.2.tgz", + "integrity": "sha1-emNune1O/O+xnO9JR6PGffrukRs=", + "optional": true, + "requires": { + "hoek": "0.9.x" + } + }, "bottleneck": { "version": "2.17.1", "resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.17.1.tgz", @@ -308,6 +341,22 @@ "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=" }, + "cache-manager": { + "version": "2.10.1", + "resolved": "https://registry.npmjs.org/cache-manager/-/cache-manager-2.10.1.tgz", + "integrity": "sha512-bk17v9IkLqNcbCzggEh82LEJhjHp+COnL57L7a0ESbM/cOuXIIBatdVjD/ps7vOsofI48++zAC14Ye+8v50flg==", + "requires": { + "async": "1.5.2", + "lru-cache": "4.0.0" + }, + "dependencies": { + "async": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/async/-/async-1.5.2.tgz", + "integrity": "sha1-7GphrlZIDAw8skHJVhjiCJL5Zyo=" + } + } + }, "caseless": { "version": "0.12.0", "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", @@ -400,6 +449,15 @@ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" }, + "cryptiles": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/cryptiles/-/cryptiles-0.2.2.tgz", + "integrity": "sha1-7ZH/HxetE9N0gohZT4pIoNJvMlw=", + "optional": true, + "requires": { + "boom": "0.4.x" + } + }, "css-select": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", @@ -416,6 +474,12 @@ "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz", "integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==" }, + "ctype": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/ctype/-/ctype-0.5.3.tgz", + "integrity": "sha1-gsGMJGH3QRTvFsE1IkrQuRRMoS8=", + "optional": true + }, "cyclist": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/cyclist/-/cyclist-0.1.1.tgz", @@ -429,6 +493,15 @@ "assert-plus": "^1.0.0" } }, + "ddg-scraper": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/ddg-scraper/-/ddg-scraper-1.0.2.tgz", + "integrity": "sha1-YJ+aj3VFvTylll6pBxIh/zn6cCA=", + "requires": { + "cheerio": "^0.22.0", + "request": "^2.80.0" + } + }, "debug": { "version": "3.2.6", "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", @@ -738,6 +811,187 @@ "path-is-absolute": "^1.0.0" } }, + "google-search-scraper": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/google-search-scraper/-/google-search-scraper-0.1.0.tgz", + "integrity": "sha1-KZKPKJtK0goAz4DBDDVOBPv718k=", + "requires": { + "cheerio": "~0.13.1", + "request": "~2.33.0" + }, + "dependencies": { + "asn1": { + "version": "0.1.11", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.1.11.tgz", + "integrity": "sha1-VZvhg3bQik7E2+gId9J4GGObLfc=", + "optional": true + }, + "assert-plus": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.1.5.tgz", + "integrity": "sha1-7nQAlBMALYTOxyGcasgRgS5yMWA=", + "optional": true + }, + "aws-sign2": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.5.0.tgz", + "integrity": "sha1-xXED96F/wDfwLXwuZLYC6iI/fWM=", + "optional": true + }, + "cheerio": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-0.13.1.tgz", + "integrity": "sha1-SK8RNFYbNSf4PZFWxPmo69grBuw=", + "requires": { + "CSSselect": "~0.4.0", + "entities": "0.x", + "htmlparser2": "~3.4.0", + "underscore": "~1.5" + } + }, + "combined-stream": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-0.0.7.tgz", + "integrity": "sha1-ATfmV7qlp1QcV6w3rF/AfXO03B8=", + "optional": true, + "requires": { + "delayed-stream": "0.0.5" + } + }, + "delayed-stream": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-0.0.5.tgz", + "integrity": "sha1-1LH0OpPoKW3+AmlPRoC8N6MTxz8=", + "optional": true + }, + "domhandler": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.2.1.tgz", + "integrity": "sha1-Wd+dzSJ+gIs2Wuc+H2aErD2Ub8I=", + "requires": { + "domelementtype": "1" + } + }, + "domutils": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.3.0.tgz", + "integrity": "sha1-mtTVm1r2ymhMYv5tdo7xcOcN8ZI=", + "requires": { + "domelementtype": "1" + } + }, + "entities": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-0.5.0.tgz", + "integrity": "sha1-9hHLWuIhBQ4AEsZpeVA/164ZzEk=" + }, + "forever-agent": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.5.2.tgz", + "integrity": "sha1-bQ4JxJIflKJ/Y9O0nF/v8epMUTA=" + }, + "form-data": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-0.1.4.tgz", + "integrity": "sha1-kavXiKupcCsaq/qLwBAxoqyeOxI=", + "optional": true, + "requires": { + "async": "~0.9.0", + "combined-stream": "~0.0.4", + "mime": "~1.2.11" + } + }, + "htmlparser2": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.4.0.tgz", + "integrity": "sha1-oc1l9YI60oXhnWOwha1yLQpR6uc=", + "requires": { + "domelementtype": "1", + "domhandler": "2.2", + "domutils": "1.3", + "readable-stream": "1.1" + } + }, + "http-signature": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-0.10.1.tgz", + "integrity": "sha1-T72sEyVZqoMjEh5UB3nAoBKyfmY=", + "optional": true, + "requires": { + "asn1": "0.1.11", + "assert-plus": "^0.1.5", + "ctype": "0.5.3" + } + }, + "mime": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.2.11.tgz", + "integrity": "sha1-WCA+7Ybjpe8XrtK32evUfwpg3RA=" + }, + "node-uuid": { + "version": "1.4.8", + "resolved": "https://registry.npmjs.org/node-uuid/-/node-uuid-1.4.8.tgz", + "integrity": "sha1-sEDrCSOWivq/jTL7HxfxFn/auQc=" + }, + "oauth-sign": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.3.0.tgz", + "integrity": "sha1-y1QPk7srIqfVlBaRoojWDo6pOG4=", + "optional": true + }, + "qs": { + "version": "0.6.6", + "resolved": "https://registry.npmjs.org/qs/-/qs-0.6.6.tgz", + "integrity": "sha1-bgFQmP9RlouKPIGQAdXyyJvEsQc=" + }, + "readable-stream": { + "version": "1.1.14", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz", + "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "0.0.1", + "string_decoder": "~0.10.x" + } + }, + "request": { + "version": "2.33.0", + "resolved": "https://registry.npmjs.org/request/-/request-2.33.0.tgz", + "integrity": "sha1-UWeHgTFyYHDsYzdS6iMKI3ncZf8=", + "requires": { + "aws-sign2": "~0.5.0", + "forever-agent": "~0.5.0", + "form-data": "~0.1.0", + "hawk": "~1.0.0", + "http-signature": "~0.10.0", + "json-stringify-safe": "~5.0.0", + "mime": "~1.2.9", + "node-uuid": "~1.4.0", + "oauth-sign": "~0.3.0", + "qs": "~0.6.0", + "tough-cookie": ">=0.12.0", + "tunnel-agent": "~0.3.0" + } + }, + "string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" + }, + "tunnel-agent": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.3.0.tgz", + "integrity": "sha1-rWgbaPUyGtKCfEz7G31d8s/pQu4=", + "optional": true + }, + "underscore": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.5.2.tgz", + "integrity": "sha1-EzXF5PXm0zu7SwBrqMhqAPVW3gg=" + } + } + }, "har-schema": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz", @@ -757,6 +1011,24 @@ "resolved": "https://registry.npmjs.org/hat/-/hat-0.0.3.tgz", "integrity": "sha1-uwFKnmSzeIrtgAWRdBPU/z1QLYo=" }, + "hawk": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/hawk/-/hawk-1.0.0.tgz", + "integrity": "sha1-uQuxaYByhUEdp//LjdJZhQLTtS0=", + "optional": true, + "requires": { + "boom": "0.4.x", + "cryptiles": "0.2.x", + "hoek": "0.9.x", + "sntp": "0.2.x" + } + }, + "hoek": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/hoek/-/hoek-0.9.1.tgz", + "integrity": "sha1-PTIkYrrfB3Fup+uFuviAec3c5QU=", + "optional": true + }, "htmlparser2": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz", @@ -1132,6 +1404,15 @@ "inherits": "^2.0.1" } }, + "lru-cache": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.0.0.tgz", + "integrity": "sha1-tcvwFVbBaWb+vlTO7A+03JDfbCg=", + "requires": { + "pseudomap": "^1.0.1", + "yallist": "^2.0.0" + } + }, "magnet-uri": { "version": "5.2.4", "resolved": "https://registry.npmjs.org/magnet-uri/-/magnet-uri-5.2.4.tgz", @@ -1294,6 +1575,17 @@ "resolved": "https://registry.npmjs.org/node-gzip/-/node-gzip-1.1.2.tgz", "integrity": "sha512-ZB6zWpfZHGtxZnPMrJSKHVPrRjURoUzaDbLFj3VO70mpLTW5np96vXyHwft4Id0o+PYIzgDkBUjIzaNHhQ8srw==" }, + "nodejs-bing": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/nodejs-bing/-/nodejs-bing-0.1.0.tgz", + "integrity": "sha1-3i/99AtBVJKFxbwMUX3VTw5fTYo=", + "requires": { + "cheerio": "^0.22.0", + "request": "^2.79.0", + "request-promise": "^4.1.1", + "urlencode": "^1.1.0" + } + }, "nth-check": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", @@ -1377,7 +1669,7 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#8cc4327ee24692a0b598842d5620b2af9c13289b", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#986c51f919c09e8f3a7937d22c8aea1fb9759090", "from": "git://github.com/TheBeastLT/parse-torrent-title.git#master" }, "parseurl": { @@ -1404,7 +1696,8 @@ "bncode": "~0.2.3", "byline": "3.1.2", "compact2string": "~1.0.0", - "hat": "0.0.3" + "hat": "0.0.3", + "needle": "github:Ivshti/needle" }, "dependencies": { "bncode": { @@ -1431,8 +1724,8 @@ "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" }, "needle": { - "version": "git+https://github.com/Ivshti/needle.git#16cceb74e24babd9c474071be1a15a09beac300d", - "from": "git+https://github.com/Ivshti/needle.git", + "version": "github:Ivshti/needle#16cceb74e24babd9c474071be1a15a09beac300d", + "from": "github:Ivshti/needle", "requires": { "debug": "^2.1.2", "iconv-lite": "^0.4.4" @@ -1602,6 +1895,11 @@ } } }, + "pseudomap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", + "integrity": "sha1-8FKijacOYYkX7wqKw0wa5aaChrM=" + }, "psl": { "version": "1.1.31", "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.31.tgz", @@ -2046,6 +2344,15 @@ } } }, + "sntp": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/sntp/-/sntp-0.2.4.tgz", + "integrity": "sha1-+4hfGLDzqtGJ+CSGJTa87ux1CQA=", + "optional": true, + "requires": { + "hoek": "0.9.x" + } + }, "speedometer": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/speedometer/-/speedometer-0.1.4.tgz", @@ -2286,6 +2593,14 @@ "punycode": "^2.1.0" } }, + "urlencode": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/urlencode/-/urlencode-1.1.0.tgz", + "integrity": "sha1-HyuibwE8hfATP3o61v8nMK33y7c=", + "requires": { + "iconv-lite": "~0.4.11" + } + }, "util-deprecate": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", @@ -2367,6 +2682,11 @@ "version": "4.0.1", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.1.tgz", "integrity": "sha1-pcbVMr5lbiPbgg77lDofBJmNY68=" + }, + "yallist": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", + "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=" } } } diff --git a/package.json b/package.json index 4ac42bb..09e87d9 100644 --- a/package.json +++ b/package.json @@ -12,19 +12,22 @@ "author": "TheBeastLT ", "license": "MIT", "dependencies": { + "cache-manager": "^2.9.0", "bottleneck": "^2.16.2", "cheerio": "^0.22.0", "cloudscraper": "^3.0.0", + "ddg-scraper": "^1.0.2", "express": "^4.16.4", + "google-search-scraper": "^0.1.0", "imdb": "^1.1.0", "is-video": "^1.0.1", "line-by-line": "^0.1.6", - "lodash": "^4.17.11", "magnet-uri": "^5.1.7", "moment": "^2.24.0", "name-to-imdb": "^2.3.0", "needle": "^2.2.4", "node-gzip": "^1.1.2", + "nodejs-bing": "^0.1.0", "parse-torrent": "^6.1.2", "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#master", "peer-search": "^0.6.x", diff --git a/scrapers/api/thepiratebay.js b/scrapers/api/thepiratebay.js index 680462f..13ec64e 100644 --- a/scrapers/api/thepiratebay.js +++ b/scrapers/api/thepiratebay.js @@ -2,9 +2,14 @@ const cheerio = require('cheerio'); const needle = require('needle'); const moment = require('moment'); -const defaultProxies = ['https://pirateproxy.sh', 'https://thepiratebay.org']; +const defaultProxies = [ + 'https://thepiratebay.org', + 'https://thepiratebay.vip', + 'https://proxybay.pro', + 'https://ukpiratebayproxy.com', + 'https://thepiratebayproxy.info']; const dumpUrl = '/static/dump/csv/'; -const defaultTimeout = 5000; +const defaultTimeout = 30000; const errors = { REQUEST_ERROR: { code: 'REQUEST_ERROR' }, @@ -76,6 +81,18 @@ Categories = { } }; +function torrent(torrentId, config = {}, retries = 2) { + if (!torrentId || retries === 0) { + return Promise.reject(new Error(`Failed ${torrentId} search`)); + } + const proxyList = config.proxyList || defaultProxies; + + return raceFirstSuccessful(proxyList + .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config))) + .then((body) => parseTorrentPage(body)) + .catch((err) => torrent(torrentId, config, retries - 1)); +} + function search(keyword, config = {}, retries = 2) { if (!keyword || retries === 0) { return Promise.reject(new Error(`Failed ${keyword} search`)); @@ -87,7 +104,7 @@ function search(keyword, config = {}, retries = 2) { return raceFirstSuccessful(proxyList .map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config))) .then((body) => parseBody(body)) - .catch(() => search(keyword, config, retries - 1)); + .catch((err) => search(keyword, config, retries - 1)); } function dumps(config = {}, retries = 2) { @@ -109,26 +126,23 @@ function dumps(config = {}, retries = 2) { function singleRequest(requestUrl, config = {}) { const timeout = config.timeout || defaultTimeout; - return new Promise(((resolve, reject) => { - needle.get(requestUrl, - { open_timeout: timeout, follow: 2 }, - (err, res, body) => { - if (err || !body) { - reject(err || errors.REQUEST_ERROR); - } else if (body.includes('Access Denied') && !body.includes('The Pirate Bay')) { - console.log(`Access Denied: ${url}`); - reject(new Error(`Access Denied: ${url}`)); - } else if (body.includes('502: Bad gateway') || + return needle('get', requestUrl, { open_timeout: timeout, follow: 2 }) + .then((response) => { + const body = response.body; + if (!body) { + throw new Error(`No body: ${requestUrl}`); + } else if (body.includes('Access Denied') && !body.includes('<title>The Pirate Bay')) { + console.log(`Access Denied: ${requestUrl}`); + throw new Error(`Access Denied: ${requestUrl}`); + } else if (body.includes('502: Bad gateway') || body.includes('403 Forbidden') || body.includes('Database maintenance') || body.includes('Origin DNS error') || - !body.includes('<title>The Pirate Bay')) { - reject(errors.REQUEST_ERROR); - } - - resolve(body); - }); - })); + !(body.includes('<title>The Pirate Bay') || body.includes('TPB') || body.includes(dumpUrl))) { + throw new Error(`Invalid body contents: ${requestUrl}`); + } + return body; + }); } function parseBody(body) { @@ -143,7 +157,7 @@ function parseBody(body) { $('table[id=\'searchResult\'] tr').each(function() { const name = $(this).find('.detLink').text(); - if (!name) { + if (!name || name === 'Do NOT download any torrent before hiding your IP with a VPN.') { return; } torrents.push({ @@ -152,13 +166,51 @@ function parseBody(body) { leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10), magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'), category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10), - subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10) + subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10), + size: parseSize($(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/)[1]) }); }); resolve(torrents); }); } +function parseTorrentPage(body) { + return new Promise((resolve, reject) => { + const $ = cheerio.load(body); + + if (!$) { + reject(new Error(errors.PARSER_ERROR)); + } + + const torrent = { + name: $('div[id=\'title\']').text().trim(), + seeders: parseInt($('dl[class=\'col2\']').find('dd').eq(2).text(), 10), + leechers: parseInt($('dl[class=\'col2\']').find('dd').eq(3).text(), 10), + magnetLink: $('div[id=\'details\']').find('a[title=\'Get this torrent\']').attr('href'), + category: Categories.VIDEO.ALL, + subcategory: parseInt($('dl[class=\'col1\']').find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10), + size: parseSize($('dl[class=\'col1\']').find('dd').eq(2).text().match(/(\d+)(?:.?Bytes)/)[1]) + }; + resolve(torrent); + }); +} + +function parseSize(sizeText) { + if (!sizeText) { + return undefined; + } + if (sizeText.includes('GiB')) { + return Math.floor(parseFloat(sizeText.trim()) * 1024 * 1024 * 1024); + } + if (sizeText.includes('MiB')) { + return Math.floor(parseFloat(sizeText.trim()) * 1024 * 1024); + } + if (sizeText.includes('KiB')) { + return Math.floor(parseFloat(sizeText.trim()) * 1024); + } + return Math.floor(parseFloat(sizeText)); +} + function raceFirstSuccessful(promises) { return Promise.all(promises.map((p) => { // If a request fails, count that as a resolution so it will keep @@ -176,4 +228,4 @@ function raceFirstSuccessful(promises) { ); } -module.exports = { search, dumps, Categories }; +module.exports = { torrent, search, dumps, Categories }; diff --git a/scrapers/horiblesubs_scraper.js b/scrapers/horiblesubs_scraper.js index 6f338f8..de1bc11 100644 --- a/scrapers/horiblesubs_scraper.js +++ b/scrapers/horiblesubs_scraper.js @@ -4,6 +4,7 @@ const Bottleneck = require('bottleneck'); const { parse } = require('parse-torrent-title'); const decode = require('magnet-uri'); const horriblesubs = require('./api/horriblesubs'); +const { Type } = require('../lib/types'); const { torrentFiles, currentSeeders } = require('../lib/torrent'); const repository = require('../lib/repository'); const { getImdbId, getMetadata } = require('../lib/metadata'); @@ -72,7 +73,7 @@ async function _constructSingleEntry(metadata, single, mirror) { infoHash: mirror.infoHash, provider: NAME, title: title, - type: 'anime', + type: Type.ANIME, imdbId: metadata.imdbId, uploadDate: single.uploadDate, seeders: seeders, diff --git a/scrapers/piratebay_dump.js b/scrapers/piratebay_dump.js index 03461c9..fb6b77f 100644 --- a/scrapers/piratebay_dump.js +++ b/scrapers/piratebay_dump.js @@ -6,9 +6,11 @@ const LineByLineReader = require('line-by-line'); const fs = require('fs'); const { parse } = require('parse-torrent-title'); const pirata = require('./api/thepiratebay'); -const { torrentFiles } = require('../lib/torrent'); +const bing = require('nodejs-bing'); +const { Type } = require('../lib/types'); const repository = require('../lib/repository'); const { getImdbId, escapeTitle } = require('../lib/metadata'); +const { parseTorrentFiles } = require('../lib/torrentFiles'); const NAME = 'ThePirateBay'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; @@ -17,18 +19,27 @@ const limiter = new Bottleneck({maxConcurrent: 40}); async function scrape() { const lastScraped = await repository.getProvider({ name: NAME }); - const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); + const lastDump = { updatedAt: 2147000000 }; + //const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) { console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); //await downloadDump(lastDump); + let entriesProcessed = 0; const lr = new LineByLineReader(CSV_FILE_PATH); lr.on('line', (line) => { if (line.includes("#ADDED")) { return; } + if (entriesProcessed % 1000 === 0) { + console.log(`Processed ${entriesProcessed} entries`); + } const row = line.match(/(?<=^|;)(".*"|[^;]+)(?=;|$)/g); + if (row.length !== 4) { + console.log(`Invalid row: ${line}`); + return; + } const torrent = { uploadDate: moment(row[0], 'YYYY-MMM-DD HH:mm:ss').toDate(), infoHash: Buffer.from(row[1], 'base64').toString('hex'), @@ -50,9 +61,10 @@ async function scrape() { } limiter.schedule(() => processTorrentRecord(torrent) - .catch((error) => console.log(`failed ${torrent.title} due: ${error}`))) - .then(() => limiter.empty()) - .then((empty) => empty && lr.resume()); + .catch((error) => console.log(`failed ${torrent.title} due: ${error}`))) + .then(() => limiter.empty()) + .then((empty) => empty && lr.resume()) + .then(() => entriesProcessed++); }); lr.on('error', (err) => { console.log(err); @@ -77,80 +89,90 @@ const seriesCategories = [ pirata.Categories.VIDEO.TV_SHOWS_HD ]; async function processTorrentRecord(record) { - const persisted = await repository.getSkipTorrent(record) - .catch(() => repository.getTorrent(record)).catch(() => undefined); - if (persisted) { + const alreadyExists = await repository.getSkipTorrent(record) + .catch(() => repository.getTorrent(record)) + .catch(() => undefined); + if (alreadyExists) { return; } - let page = 0; - let torrentFound; - while (!torrentFound && page < 5) { - const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page }); - torrentFound = torrents. - filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0]; - page = torrents.length === 0 ? 1000 : page + 1; - } + const torrentFound = await findTorrent(record); if (!torrentFound) { - console.log(`not found: ${JSON.stringify(record)}`); + //console.log(`not found: ${JSON.stringify(record)}`); repository.createSkipTorrent(record); return; } if (!allowedCategories.includes(torrentFound.subcategory)) { - console.log(`wrong category: ${torrentFound.name}`); + //console.log(`wrong category: ${torrentFound.name}`); repository.createSkipTorrent(record); return; } - const type = seriesCategories.includes(torrentFound.subcategory) ? 'series' : 'movie'; - console.log(`imdbId search: ${torrentFound.name}`); + const type = seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE; const titleInfo = parse(torrentFound.name); const imdbId = await getImdbId({ name: escapeTitle(titleInfo.title).toLowerCase(), year: titleInfo.year, type: type - }).catch(() => undefined); + }).catch((error) => undefined); + const torrent = { + infoHash: record.infoHash, + provider: NAME, + title: torrentFound.name, + size: record.size, + type: type, + uploadDate: record.uploadDate, + seeders: torrentFound.seeders, + }; if (!imdbId) { console.log(`imdbId not found: ${torrentFound.name}`); - repository.updateTorrent({ - infoHash: record.infoHash, - provider: NAME, - title: torrentFound.name, - uploadDate: record.uploadDate, - seeders: torrentFound.seeders, - }); + repository.createFailedImdbTorrent(torrent); return; } - if (type === 'movie' || titleInfo.episode) { - repository.updateTorrent({ - infoHash: record.infoHash, - provider: NAME, - title: torrentFound.name, - imdbId: imdbId, - uploadDate: record.uploadDate, - seeders: torrentFound.seeders, - }); - return; - } - - const files = await torrentFiles(record).catch(() => []); + const files = await parseTorrentFiles(torrent, imdbId); if (!files || !files.length) { console.log(`no video files found: ${torrentFound.name}`); return; } - repository.updateTorrent({ - infoHash: record.infoHash, - provider: NAME, - title: torrentFound.name, - imdbId: imdbId, - uploadDate: record.uploadDate, - seeders: torrentFound.seeders, - files: files - }) + repository.createTorrent(torrent) + .then(() => files.forEach(file => repository.createFile(file))); + console.log(`Created entry for ${torrentFound.name}`); +} + +async function findTorrent(record) { + return findTorrentInSource(record) + .catch((error) => findTorrentViaBing(record)); +} + +async function findTorrentInSource(record) { + let page = 0; + let torrentFound; + while (!torrentFound && page < 5) { + const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page }); + torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0]; + page = torrents.length === 0 ? 1000 : page + 1; + } + if (!torrentFound) { + return Promise.reject(new Error(`Failed to find torrent ${record.title}`)); + } + return Promise.resolve(torrentFound); +} + +async function findTorrentViaBing(record) { + return bing.web(`${record.infoHash}`) + .then((results) => results + .find(result => result.description.includes('Direct download via magnet link') || result.description.includes('Get this torrent'))) + .then((result) => { + if (!result) { + throw new Error(`Failed to find torrent ${record.title}`); + } + return result.link.match(/torrent\/(\w+)\//)[1]; + }) + .then((torrentId) => pirata.torrent(torrentId)) } function downloadDump(dump) {