diff --git a/index.js b/index.js index a964121..2fe3842 100644 --- a/index.js +++ b/index.js @@ -1,10 +1,11 @@ const express = require("express"); const server = express(); +const { init } = require('./lib/torrent'); const { connect } = require('./lib/repository'); const tpbDump = require('./scrapers/piratebay_dump'); -const horribleSubs = require('./scrapers/api/horriblesubs'); +const horribleSubsScraper = require('./scrapers/horiblesubs_scraper'); -const providers = [tpbDump]; +const providers = [horribleSubsScraper]; async function scrape() { providers.forEach((provider) => provider.scrape()); @@ -17,12 +18,7 @@ server.post('/scrape', function(req, res) { server.listen(7000, async function () { await connect(); + await init(); console.log('Scraper started'); - // const shows = await horribleSubs.allShows(); - // console.log(shows); - // const showInfo = await horribleSubs.showData('/shows/one-piece'); - // console.log(showInfo) - // const latestEntries = await horribleSubs.getLatestEntries(); - // console.log(latestEntries); - //scrape(); + scrape(); }); \ No newline at end of file diff --git a/lib/metadata.js b/lib/metadata.js index ecc8890..939ddda 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -10,6 +10,7 @@ function getMetadata(imdbId, type) { .then((body) => { if (body && body.meta && body.meta.name) { return { + imdbId: imdbId, title: body.meta.name, year: body.meta.year, genres: body.meta.genres, @@ -38,14 +39,6 @@ function escapeTitle(title, hyphenEscape = true) { .trim(); } -const hardcodedTitles = { - 'tt0388629': 'one piece', - 'tt0182629': 'rurouni kenshin', - 'tt2098220': 'hunter x hunter 2011', - 'tt1409055': 'dragon ball kai', - 'tt7441658': 'black clover tv' -}; - async function seriesMetadata(id) { const idInfo = id.split(':'); const imdbId = idInfo[0]; @@ -54,7 +47,6 @@ async function seriesMetadata(id) { const metadata = await getMetadata(imdbId, 'series'); const title = escapeTitle(metadata.title); - const hasEpisodeCount = metadata.episodeCount && metadata.episodeCount.length >= season; return { imdb: imdbId, @@ -91,4 +83,4 @@ async function getImdbId(info) { }); } -module.exports = { movieMetadata, seriesMetadata, getImdbId }; +module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId }; diff --git a/lib/repository.js b/lib/repository.js index df4c2e0..199582a 100644 --- a/lib/repository.js +++ b/lib/repository.js @@ -5,18 +5,19 @@ const POSTGRES_URI = process.env.POSTGRES_URI || 'postgres://torrentio:postgres@ const database = new Sequelize(POSTGRES_URI, { logging: false }); const Provider = database.define('provider', { - name: { type: Sequelize.STRING(16), primaryKey: true}, + name: { type: Sequelize.STRING(32), primaryKey: true}, lastScraped: { type: Sequelize.DATE } }); const Torrent = database.define('torrent', { infoHash: { type: Sequelize.STRING(64), primaryKey: true }, - provider: { type: Sequelize.STRING(16), allowNull: false }, + provider: { type: Sequelize.STRING(32), allowNull: false }, title: { type: Sequelize.STRING(128), allowNull: false }, + type: { type: Sequelize.STRING(16), allowNull: false }, imdbId: { type: Sequelize.STRING(12) }, uploadDate: { type: Sequelize.DATE, allowNull: false }, seeders: { type: Sequelize.SMALLINT }, - files: { type: Sequelize.ARRAY(Sequelize.TEXT) }, + files: { type: Sequelize.JSONB } }); const SkipTorrent = database.define('skip_torrent', { diff --git a/lib/torrent.js b/lib/torrent.js index a663971..c1c3567 100644 --- a/lib/torrent.js +++ b/lib/torrent.js @@ -2,17 +2,36 @@ const torrentStream = require('torrent-stream'); const cheerio = require('cheerio'); const needle = require('needle'); const parseTorrent = require('parse-torrent'); -const cloudscraper = require('cloudscraper'); +const Tracker = require("peer-search/tracker"); const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20; const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"]; +let TRACKERS = []; + +module.exports.init = async function() { + TRACKERS = await getTrackerList(); +}; module.exports.torrentFiles = function(torrent) { - return filesFromKat(torrent.infoHash) + return filesFromTorrentFile(torrent) + .catch(() => filesFromKat(torrent.infoHash)) .catch(() => filesFromTorrentStream(torrent)) - .then((files) => files - .filter((file) => isVideo(file)) - .map((file) => `${file.fileIndex}@@${file.path}`)); + .then((files) => files.filter((file) => isVideo(file))); +}; + +module.exports.currentSeeders = function (torrent) { + if (!torrent.infoHash) { + return Promise.reject(new Error("no infoHash")); + } + return new Promise((resolve) => + Promise.all(TRACKERS.map((tracker) => new Promise((resolve) => { + const t = new Tracker(tracker, { }, torrent.infoHash); + t.run(); + t.on("info", (inf) => resolve([inf.seeders, inf.leechers])); + setTimeout(() => resolve([0, 0]), 1000); + })) + ).then((results) => resolve(results.reduce((seeders, next) => seeders + next[0], 0))) + ); }; // async function filesFromBtSeeds(infoHash) { @@ -24,10 +43,13 @@ module.exports.torrentFiles = function(torrent) { // } function filesFromKat(infoHash) { + if (!infoHash) { + return Promise.reject(new Error("no infoHash")); + } const url = `http://kat.rip/torrent/${infoHash}.html`; return needle('get', url, { open_timeout: 2000 }) .then((response) => { - if (!response.body) { + if (!response.body || response.statusCode !== 200) { throw new Error('torrent not found in kat') } return response.body @@ -43,13 +65,41 @@ function filesFromKat(infoHash) { size: convertToBytes($(row).find('td[class=\'torFileSize\']').text()) }); }); + + if (!files[files.length - 1].size) { + throw new Error('not full file list') + } return files; }) } +async function filesFromTorrentFile(torrent) { + if (!torrent.torrentLink) { + return Promise.reject(new Error("no torrentLink")); + } + + needle('get', torrent.torrentLink, { open_timeout: 2000 }) + .then((response) => { + if (!response.body || response.statusCode !== 200) { + throw new Error('torrent not found') + } + return response.body + }) + .then((body) => parseTorrent(body)) + .then((info) => info.files.map((file, fileId) => ({ + fileIndex: fileId, + name: file.name, + path: file.path.replace(/^[^\/]+\//, ''), + size: file.length + }))); +} + async function filesFromTorrentStream(torrent) { + if (!torrent.infoHash && !torrent.magnetLink) { + return Promise.reject(new Error("no infoHash or magnetLink")); + } return new Promise((resolve, rejected) => { - const engine = new torrentStream(torrent.infoHash, { connections: MAX_PEER_CONNECTIONS }); + const engine = new torrentStream(torrent.magnetLink || torrent.infoHash, { connections: MAX_PEER_CONNECTIONS, trackers: TRACKERS }); engine.ready(() => { const files = engine.files @@ -74,7 +124,6 @@ function isVideo(title) { return EXTENSIONS.includes(title.path.match(/\.(\w{2,4})$/)[1]); } - function convertToBytes(sizeString) { if (!sizeString) { return; @@ -88,22 +137,34 @@ function convertToBytes(sizeString) { return Math.floor(parseFloat(sizeString) * multiplier); } - function dynamicTimeout(torrent) { if (torrent.seeders < 5) { - return 2000; - } else if (torrent.seeders < 10) { return 3000; - } else if (torrent.seeders < 20) { + } else if (torrent.seeders < 10) { return 4000; - } else if (torrent.seeders < 30) { + } else if (torrent.seeders < 20) { return 5000; - } else if (torrent.seeders < 50) { + } else if (torrent.seeders < 30) { return 7000; + } else if (torrent.seeders < 50) { + return 9000; } else if (torrent.seeders < 100) { - return 10000; + return 12000; } else { return 15000; } } +function getTrackerList() { + return needle('get', 'https://torrents.me/tracker-list/', { open_timeout: 2000 }) + .then((response) => { + if (!response.body || response.statusCode !== 200) { + throw new Error('tracker list not found') + } + return response.body + }) + .then((body) => cheerio.load(body)) + .then(($) => $('div[class="small-12 columns"] pre').text()) + .then((text) => text.replace(/"/g, '').trim().split('\n')) +} + diff --git a/package-lock.json b/package-lock.json index c36b302..e682689 100644 --- a/package-lock.json +++ b/package-lock.json @@ -57,6 +57,11 @@ "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" }, + "async": { + "version": "0.9.2", + "resolved": "https://registry.npmjs.org/async/-/async-0.9.2.tgz", + "integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0=" + }, "asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", @@ -293,6 +298,11 @@ "resolved": "https://registry.npmjs.org/buffer-writer/-/buffer-writer-2.0.0.tgz", "integrity": "sha512-a7ZpuTZU1TRtnwyCNW3I5dc0wWNC3VR9S++Ewyk2HHZdrO3CQJqSpd+95Us590V6AL7JqUAH2IwZ/398PmNFgw==" }, + "byline": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/byline/-/byline-3.1.2.tgz", + "integrity": "sha1-hqY5TLGz8PwzT7F2sI5hrwxvHFk=" + }, "bytes": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", @@ -1385,6 +1395,51 @@ "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" }, + "peer-search": { + "version": "0.6.11", + "resolved": "https://registry.npmjs.org/peer-search/-/peer-search-0.6.11.tgz", + "integrity": "sha1-Nfqs2lj5+YeDNOmj2A06PRzz7ag=", + "requires": { + "async": "0.9.x", + "bncode": "~0.2.3", + "byline": "3.1.2", + "compact2string": "~1.0.0", + "hat": "0.0.3" + }, + "dependencies": { + "bncode": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/bncode/-/bncode-0.2.3.tgz", + "integrity": "sha1-N/hR3I5HGIqD+8D2+kd1ysyaMpY=" + }, + "compact2string": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/compact2string/-/compact2string-1.0.0.tgz", + "integrity": "sha1-S8jc7XudYKX0NFSWp6vO8hpgzvU=" + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + }, + "needle": { + "version": "git+https://github.com/Ivshti/needle.git#16cceb74e24babd9c474071be1a15a09beac300d", + "from": "git+https://github.com/Ivshti/needle.git", + "requires": { + "debug": "^2.1.2", + "iconv-lite": "^0.4.4" + } + } + } + }, "peer-wire-protocol": { "version": "0.7.1", "resolved": "https://registry.npmjs.org/peer-wire-protocol/-/peer-wire-protocol-0.7.1.tgz", diff --git a/package.json b/package.json index 396184c..4ac42bb 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "node-gzip": "^1.1.2", "parse-torrent": "^6.1.2", "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#master", + "peer-search": "^0.6.x", "pg": "^7.8.2", "pg-hstore": "^2.3.2", "sequelize": "^4.43.0", diff --git a/scrapers/horiblesubs_scraper.js b/scrapers/horiblesubs_scraper.js new file mode 100644 index 0000000..6f338f8 --- /dev/null +++ b/scrapers/horiblesubs_scraper.js @@ -0,0 +1,136 @@ +const moment = require('moment'); +const needle = require('needle'); +const Bottleneck = require('bottleneck'); +const { parse } = require('parse-torrent-title'); +const decode = require('magnet-uri'); +const horriblesubs = require('./api/horriblesubs'); +const { torrentFiles, currentSeeders } = require('../lib/torrent'); +const repository = require('../lib/repository'); +const { getImdbId, getMetadata } = require('../lib/metadata'); + +const NAME = 'HorribleSubs'; + +const limiter = new Bottleneck({maxConcurrent: 1}); +const entryLimiter = new Bottleneck({maxConcurrent: 20}); + +async function scrape() { + const lastScraped = await repository.getProvider({ name: NAME }); + + if (!lastScraped.lastScraped) { + console.log(`${NAME}: no previous scrapping exist`); + await _scrapeAllShows() + } +} + +async function _scrapeAllShows() { + console.log(`${NAME}: getting all shows...`); + const shows = await horriblesubs.allShows(); + + Promise.all(shows + .slice(0, 20) + //.filter(show => show.url.includes('piece')) + .map((show) => limiter.schedule(() => horriblesubs.showData(show) + .then((showData) => _parseShowData(showData)) + .catch((err) => console.log(err))))); +} + +const hardcodedShows = { + '199': 'tt2098220', + '347': 'tt0388629' +}; + +async function _parseShowData(showData) { + console.log(`${NAME}: scrapping ${showData.title} data...`); + const imdbId = hardcodedShows[showData.showId] || await getImdbId({ + name: showData.title.replace(/\W+/g, ' ').toLowerCase(), + type: 'series' + }).catch(() => undefined); + const metadata = imdbId && await getMetadata(imdbId, 'series') || {}; + + return Promise.all([ + showData.singleEpisodes + .map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror)))) + .reduce((a, b) => a.concat(b), []), + showData.packEpisodes + .map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() =>_constructPackEntry(metadata, pack, mirror)))) + .reduce((a, b) => a.concat(b), []) + ].reduce((a, b) => a.concat(b), [])) + .then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent))); +} + +async function _constructSingleEntry(metadata, single, mirror) { + mirror.infoHash = decode(mirror.magnetLink).infoHash; + const seeders = await currentSeeders(mirror); + const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/); + const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season + const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode + const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode); + const title = `${single.title} ${single.episode} [${mirror.resolution}]`; + const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode}; + + return { + infoHash: mirror.infoHash, + provider: NAME, + title: title, + type: 'anime', + imdbId: metadata.imdbId, + uploadDate: single.uploadDate, + seeders: seeders, + files: [file] + } +} + +async function _constructPackEntry(metadata, pack, mirror) { + mirror.infoHash = decode(mirror.magnetLink).infoHash; + const seeders = await currentSeeders(mirror); + const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/); + const xSeason = seasonMatch && parseInt(seasonMatch[1]); + + const files = await torrentFiles(mirror) + .then((files) => files.map((file) => { + const title = file.path.match(/[^\/]+$/)[0]; + const titleInfo = parse(title.replace(pack.title, '')); + return titleInfo.episodes + .map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode)) + .map((actual) => ({ + title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode + })); + })) + .then((files) => files.reduce((a, b) => a.concat(b), [])) + .catch(() => []); + + return { + infoHash: mirror.infoHash, + provider: NAME, + title: `${pack.title} ${pack.episode} [${mirror.resolution}]`, + type: 'anime', + imdbId: metadata.imdbId, + uploadDate: pack.uploadDate, + seeders: seeders, + files: files + } +} + +function actualSeasonEpisode(metadata, xSeason, xEpisode) { + if (xSeason) { + return { + season: xSeason, + episode: xEpisode, + absoluteEpisode: metadata.episodeCount && metadata.episodeCount + .slice(0, xSeason - 1) + .reduce((a, b) => a + b, xEpisode), + } + } else if (metadata.episodeCount) { + return metadata.episodeCount + .reduce((epInfo, epCount) => { + if (epInfo.episode > epCount) { + epInfo.season = epInfo.season + 1; + epInfo.episode = epInfo.episode - epCount; + } + return epInfo; + }, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode }) + } + return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode } +} + +module.exports = { scrape }; \ No newline at end of file diff --git a/scrapers/piratebay_dump.js b/scrapers/piratebay_dump.js index 1278835..f8bda6c 100644 --- a/scrapers/piratebay_dump.js +++ b/scrapers/piratebay_dump.js @@ -10,7 +10,7 @@ const { torrentFiles } = require('../lib/torrent'); const repository = require('../lib/repository'); const { getImdbId } = require('../lib/metadata'); -const NAME = 'thepiratebay'; +const NAME = 'ThePirateBay'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; const limiter = new Bottleneck({maxConcurrent: 40});