diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e0b850e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/.idea
+/node_modules
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e2cb5bb
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# Torrentio Scraper
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..13075ce
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,16 @@
+version: '3'
+services:
+  database:
+    image: postgres
+    volumes:
+      - db-data:/var/lib/postgresql/data
+    ports:
+      - 5432:5432
+    environment:
+      POSTGRES_USER: torrentio
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: torrentio
+
+volumes:
+  db-data:
+    driver: local
\ No newline at end of file
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..ebd5d2d
--- /dev/null
+++ b/index.js
@@ -0,0 +1,37 @@
+const express = require("express");
+const server = express();
+const { connect } = require('./lib/repository');
+const tpbDump = require('./scrapers/piratebay_dump');
+const horribleSubs = require('./scrapers/api/horriblesubs');
+
+const providers = [tpbDump];
+
+/**
+ * Runs a scrape on every registered provider.
+ *
+ * @returns {Promise<Array>} resolves once every provider.scrape() call has finished; rejects on the first failure
+ */
+async function scrape() {
+  // Hand the combined promise back so callers can wait for completion and see failures.
+  return Promise.all(providers.map((provider) => provider.scrape()));
+}
+
+// Kicks off a scrape in the background; the response is sent without waiting for it.
+server.post('/scrape', function(req, res) {
+  scrape().catch((error) => console.error('Scrape failed', error));
+  // res.send(<number>) is the deprecated status-only form; sendStatus replaces it.
+  res.sendStatus(200);
+});
+
+server.listen(7000, async function () {
+  try {
+    await connect();
+    console.log('Scraper started');
+    const shows = await horribleSubs.allShows();
+    console.log(shows);
+    //scrape();
+  } catch (error) {
+    // Otherwise a failed database sync or show lookup becomes an unhandled rejection.
+    console.error('Scraper failed to start', error);
+  }
+});
\ No newline at end of file
diff --git a/lib/metadata.js b/lib/metadata.js
new file mode 100644
index 0000000..ecc8890
--- /dev/null
+++ b/lib/metadata.js
@@ -0,0 +1,131 @@
+const _ = require('lodash');
+const needle = require('needle');
+const nameToImdb = require('name-to-imdb');
+
+const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
+
+/**
+ * Fetches metadata for a title from Cinemeta.
+ *
+ * @param {string} imdbId - id placed in the Cinemeta meta URL
+ * @param {string} type - Cinemeta type segment; this file uses 'series' and 'movie'
+ * @returns {Promise<Object>} title, year, genres (always an array) and episodeCount, the number of
+ *   videos per season ordered by season number with season 0 left out (undefined without videos)
+ * @throws {Error} (as a rejection) when the response carries no meta name
+ */
+function getMetadata(imdbId, type) {
+  return needle('get', `${CINEMETA_URL}/meta/${type}/${imdbId}.json`, { open_timeout: 1000 })
+      .then((response) => response.body)
+      .then((body) => {
+        if (body && body.meta && body.meta.name) {
+          return {
+            title: body.meta.name,
+            year: body.meta.year,
+            // Fall back to an empty list so callers can read .length / .includes without a TypeError.
+            genres: body.meta.genres || [],
+            episodeCount: body.meta.videos && _.chain(body.meta.videos)
+                .countBy('season')
+                .toPairs()
+                .filter((pair) => pair[0] !== '0')
+                .sortBy((pair) => parseInt(pair[0], 10))
+                .map((pair) => pair[1])
+                .value()
+          };
+        } else {
+          console.log(`failed cinemeta query: Empty Body`);
+          throw new Error('failed cinemeta query');
+        }
+      });
+}
+
+/**
+ * Normalizes a title for comparison: lowercases it, strips combining diacritics, turns '&' into
+ * 'and', collapses separator runs into one space and drops every character that is not a word
+ * character, hyphen, space or parenthesis.
+ *
+ * @param {string} title
+ * @param {boolean} [hyphenEscape=true] - when true, hyphens count as separators as well
+ * @returns {string} the normalized title
+ */
+function escapeTitle(title, hyphenEscape = true) {
+  return title.toLowerCase()
+      .normalize('NFKD') // normalize non-ASCII characters
+      .replace(/[\u0300-\u036F]/g, '')
+      .replace(/&/g, 'and')
+      .replace(hyphenEscape ? /[.,_+ -]+/g : /[.,_+ ]+/g, ' ') // replace dots, commas or underscores with spaces
+      .replace(/[^\w- ()]/gi, '') // remove all non-alphanumeric chars
+      .trim();
+}
+
+const hardcodedTitles = {
+  'tt0388629': 'one piece',
+  'tt0182629': 'rurouni kenshin',
+  'tt2098220': 'hunter x hunter 2011',
+  'tt1409055': 'dragon ball kai',
+  'tt7441658': 'black clover tv'
+};
+
+/**
+ * Resolves metadata for a single series episode.
+ *
+ * @param {string} id - '<imdbId>:<season>:<episode>'
+ * @returns {Promise<Object>} imdb, title, season, episode, absoluteEpisode, genres and isAnime
+ */
+async function seriesMetadata(id) {
+  const idInfo = id.split(':');
+  const imdbId = idInfo[0];
+  const season = parseInt(idInfo[1], 10);
+  const episode = parseInt(idInfo[2], 10);
+
+  const metadata = await getMetadata(imdbId, 'series');
+  const title = escapeTitle(metadata.title);
+  const hasEpisodeCount = metadata.episodeCount && metadata.episodeCount.length >= season;
+
+  return {
+    imdb: imdbId,
+    title: hardcodedTitles[imdbId] || title,
+    season: season,
+    episode: episode,
+    // Episode number counted across seasons: the sizes of all earlier seasons plus this episode.
+    absoluteEpisode: hasEpisodeCount && metadata.episodeCount.slice(0, season - 1).reduce((a, b) => a + b, episode),
+    genres: metadata.genres,
+    isAnime: !metadata.genres.length || metadata.genres.includes('Animation')
+  };
+}
+
+/**
+ * Resolves metadata for a movie.
+ *
+ * @param {string} id - id passed to Cinemeta for the movie
+ * @returns {Promise<Object>} imdb, title, year, genres and isAnime
+ */
+async function movieMetadata(id) {
+  const metadata = await getMetadata(id, 'movie');
+
+  return {
+    imdb: id,
+    title: escapeTitle(metadata.title),
+    year: metadata.year,
+    genres: metadata.genres,
+    isAnime: !metadata.genres.length || metadata.genres.includes('Animation')
+  };
+}
+
+/**
+ * Promise wrapper around the callback-based name-to-imdb lookup.
+ *
+ * @param {*} info - handed to nameToImdb unchanged
+ * @returns {Promise<*>} the lookup result; rejects when the lookup returns nothing
+ */
+async function getImdbId(info) {
+  return new Promise((resolve, reject) => {
+    nameToImdb(info, function(err, res) {
+      if (res) {
+        resolve(res);
+      } else {
+        reject(err || new Error('failed imdbId search'));
+      }
+    });
+  });
+}
+
+module.exports = { movieMetadata, seriesMetadata, getImdbId };
diff --git a/lib/repository.js b/lib/repository.js
new file mode 100644
index 0000000..df4c2e0
--- /dev/null
+++ b/lib/repository.js
@@ -0,0 +1,138 @@
+const { Sequelize }= require('sequelize');
+
+const POSTGRES_URI = process.env.POSTGRES_URI || 'postgres://torrentio:postgres@localhost:5432/torrentio';
+
+const database = new Sequelize(POSTGRES_URI, { logging: false });
+
+const Provider = database.define('provider', {
+  name: { type: Sequelize.STRING(16), primaryKey: true},
+  lastScraped: { type: Sequelize.DATE }
+});
+
+const Torrent = database.define('torrent', {
+  infoHash: { type: Sequelize.STRING(64), primaryKey: true },
+  provider: { type: Sequelize.STRING(16), allowNull: false },
+  title: { type: Sequelize.STRING(128), allowNull: false },
+  imdbId: { type: Sequelize.STRING(12) },
+  uploadDate: { type: Sequelize.DATE, allowNull: false },
+  seeders: { type: Sequelize.SMALLINT },
+  files: { type: Sequelize.ARRAY(Sequelize.TEXT) },
+});
+
+const SkipTorrent = database.define('skip_torrent', {
+  infoHash: {type: Sequelize.STRING(64), primaryKey: true},
+});
+
+const FailedImdbTorrent = database.define('failed_imdb_torrent', {
+  infoHash: {type: Sequelize.STRING(64), primaryKey: true},
+});
+
+/**
+ * Synchronizes the model definitions with the database, altering existing tables to match.
+ *
+ * @returns {Promise} resolves once the sync has completed
+ */
+function connect() {
+  return database.sync({ alter: true });
+}
+
+/**
+ * Finds the provider row named `provider.name`, creating it from `provider` when missing.
+ *
+ * @param {{name: string}} provider
+ * @returns {Promise} result of Provider.findOrCreate
+ */
+function getProvider(provider) {
+  return Provider.findOrCreate({ where: { name: provider.name }, defaults: provider });
+}
+
+/**
+ * Updates the provider row identified by `provider.name`.
+ *
+ * @param {{name: string}} provider
+ * @returns {Promise} result of Provider.update
+ */
+function updateProvider(provider) {
+  // Model.update needs a `where` option; scope the update to this provider's primary key.
+  return Provider.update(provider, { where: { name: provider.name } });
+}
+
+/**
+ * Loads a stored torrent by info hash.
+ *
+ * @param {{infoHash: string}} torrent
+ * @returns {Promise<Object>} the stored row's values; rejects when the torrent is not stored
+ */
+function getTorrent(torrent) {
+  return Torrent.findByPk(torrent.infoHash)
+      .then((result) => {
+        if (!result) {
+          throw new Error(`torrent not found: ${torrent.infoHash}`);
+        }
+        return result.dataValues;
+      });
+}
+
+/**
+ * Inserts or updates a torrent row.
+ *
+ * @param {Object} torrent - values for the torrent model
+ * @returns {Promise} result of Torrent.upsert
+ */
+function updateTorrent(torrent) {
+  return Torrent.upsert(torrent);
+}
+
+/**
+ * Looks the info hash up in the skip_torrent table, falling back to failed_imdb_torrent.
+ *
+ * @param {{infoHash: string}} torrent
+ * @returns {Promise<Object>} the matching row's values; rejects when it is in neither table
+ */
+function getSkipTorrent(torrent) {
+  return SkipTorrent.findByPk(torrent.infoHash)
+      .then((result) => {
+        if (!result) {
+          return getFailedImdbTorrent(torrent);
+        }
+        return result.dataValues;
+      });
+}
+
+/**
+ * Records the torrent's info hash in the skip_torrent table.
+ *
+ * @param {{infoHash: string}} torrent
+ * @returns {Promise} result of SkipTorrent.upsert
+ */
+function createSkipTorrent(torrent) {
+  return SkipTorrent.upsert({ infoHash: torrent.infoHash });
+}
+
+/**
+ * Looks the info hash up in the failed_imdb_torrent table.
+ *
+ * @param {{infoHash: string}} torrent
+ * @returns {Promise<Object>} the matching row's values; rejects when there is no such row
+ */
+function getFailedImdbTorrent(torrent) {
+  return FailedImdbTorrent.findByPk(torrent.infoHash)
+      .then((result) => {
+        if (!result) {
+          throw new Error(`torrent not found: ${torrent.infoHash}`);
+        }
+        return result.dataValues;
+      });
+}
+
+/**
+ * Records the torrent's info hash in the failed_imdb_torrent table.
+ *
+ * @param {{infoHash: string}} torrent
+ * @returns {Promise} result of FailedImdbTorrent.upsert
+ */
+function createFailedImdbTorrent(torrent) {
+  return FailedImdbTorrent.upsert({ infoHash: torrent.infoHash });
+}
+
+module.exports = { connect, getProvider, updateProvider, getTorrent, updateTorrent, getSkipTorrent, createSkipTorrent, createFailedImdbTorrent };
\ No newline at end of file
diff --git a/lib/torrent.js b/lib/torrent.js
new file mode 100644
index 0000000..a663971
--- /dev/null
+++ b/lib/torrent.js
@@ -0,0 +1,109 @@
+const torrentStream = require('torrent-stream');
+const cheerio = require('cheerio');
+const needle = require('needle');
+const parseTorrent = require('parse-torrent');
+const cloudscraper = require('cloudscraper');
+
+const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
+const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
+
+module.exports.torrentFiles = function(torrent) {
+  return filesFromKat(torrent.infoHash)
+      .catch(() => filesFromTorrentStream(torrent))
+      .then((files) => files
+          .filter((file) => isVideo(file))
+          .map((file) => `${file.fileIndex}@@${file.path}`));
+};
+
+// async function filesFromBtSeeds(infoHash) {
+//   const url = `https://www.btseed.net/show/${infoHash}`;
+//   return needle('get', url, { open_timeout: 2000 })
+//     .then((response) => response.body)
+//     .then((body) => body.match(/