adds horriblesubs scraper

TheBeastLT
2019-03-15 17:17:28 +01:00
parent fa80d34883
commit 88bddcff54
8 changed files with 280 additions and 38 deletions

index.js
View File

@@ -1,10 +1,11 @@
const express = require("express");
const server = express();
const { init } = require('./lib/torrent');
const { connect } = require('./lib/repository');
const tpbDump = require('./scrapers/piratebay_dump');
const horribleSubs = require('./scrapers/api/horriblesubs');
+const horribleSubsScraper = require('./scrapers/horiblesubs_scraper');
-const providers = [tpbDump];
+const providers = [horribleSubsScraper];
async function scrape() {
providers.forEach((provider) => provider.scrape());
@@ -17,12 +18,7 @@ server.post('/scrape', function(req, res) {
server.listen(7000, async function () {
await connect();
await init();
console.log('Scraper started');
-// const shows = await horribleSubs.allShows();
-// console.log(shows);
-// const showInfo = await horribleSubs.showData('/shows/one-piece');
-// console.log(showInfo)
-// const latestEntries = await horribleSubs.getLatestEntries();
-// console.log(latestEntries);
-//scrape();
+scrape();
});
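Each provider only needs to expose a `scrape()` function, so once the HorribleSubs scraper is stable the two sources could run side by side with a one-line change (a sketch, not part of this commit):

const providers = [tpbDump, horribleSubsScraper];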

lib/metadata.js
View File

@@ -10,6 +10,7 @@ function getMetadata(imdbId, type) {
.then((body) => {
if (body && body.meta && body.meta.name) {
return {
+imdbId: imdbId,
title: body.meta.name,
year: body.meta.year,
genres: body.meta.genres,
@@ -38,14 +39,6 @@ function escapeTitle(title, hyphenEscape = true) {
.trim();
}
-const hardcodedTitles = {
-'tt0388629': 'one piece',
-'tt0182629': 'rurouni kenshin',
-'tt2098220': 'hunter x hunter 2011',
-'tt1409055': 'dragon ball kai',
-'tt7441658': 'black clover tv'
-};
async function seriesMetadata(id) {
const idInfo = id.split(':');
const imdbId = idInfo[0];
@@ -54,7 +47,6 @@ async function seriesMetadata(id) {
const metadata = await getMetadata(imdbId, 'series');
const title = escapeTitle(metadata.title);
const hasEpisodeCount = metadata.episodeCount && metadata.episodeCount.length >= season;
return {
imdb: imdbId,
@@ -91,4 +83,4 @@ async function getImdbId(info) {
});
}
-module.exports = { movieMetadata, seriesMetadata, getImdbId };
+module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId };
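Exporting `getMetadata` lets the new scraper resolve series details directly by IMDb id. A minimal usage sketch, assuming the response shape shown above (`imdbId`, `title`, `year`, `genres`, plus the `episodeCount` array that `seriesMetadata` relies on):

const { getMetadata } = require('./lib/metadata');

getMetadata('tt0388629', 'series')
  .then((meta) => console.log(meta.title, meta.year, meta.episodeCount));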

lib/repository.js
View File

@@ -5,18 +5,19 @@ const POSTGRES_URI = process.env.POSTGRES_URI || 'postgres://torrentio:postgres@
const database = new Sequelize(POSTGRES_URI, { logging: false });
const Provider = database.define('provider', {
-name: { type: Sequelize.STRING(16), primaryKey: true},
+name: { type: Sequelize.STRING(32), primaryKey: true},
lastScraped: { type: Sequelize.DATE }
});
const Torrent = database.define('torrent', {
infoHash: { type: Sequelize.STRING(64), primaryKey: true },
-provider: { type: Sequelize.STRING(16), allowNull: false },
+provider: { type: Sequelize.STRING(32), allowNull: false },
title: { type: Sequelize.STRING(128), allowNull: false },
type: { type: Sequelize.STRING(16), allowNull: false },
imdbId: { type: Sequelize.STRING(12) },
uploadDate: { type: Sequelize.DATE, allowNull: false },
seeders: { type: Sequelize.SMALLINT },
-files: { type: Sequelize.ARRAY(Sequelize.TEXT) },
+files: { type: Sequelize.JSONB }
});
const SkipTorrent = database.define('skip_torrent', {

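With the new schema, the `files` column is JSONB instead of a TEXT array, so each torrent can store structured file entries rather than the old `fileIndex@@path` strings. A sketch of what the new scraper persists per file (values are illustrative, not from the commit):

files: [
  { title: 'Show - 30 [720p].mkv', season: 2, episode: 4, absoluteEpisode: 30 }
]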
lib/torrent.js
View File

@@ -2,17 +2,36 @@ const torrentStream = require('torrent-stream');
const cheerio = require('cheerio');
const needle = require('needle');
const parseTorrent = require('parse-torrent');
const cloudscraper = require('cloudscraper');
const Tracker = require("peer-search/tracker");
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
let TRACKERS = [];
module.exports.init = async function() {
TRACKERS = await getTrackerList();
};
module.exports.torrentFiles = function(torrent) {
-return filesFromKat(torrent.infoHash)
+return filesFromTorrentFile(torrent)
+.catch(() => filesFromKat(torrent.infoHash))
+.catch(() => filesFromTorrentStream(torrent))
-.then((files) => files
-.filter((file) => isVideo(file))
-.map((file) => `${file.fileIndex}@@${file.path}`));
+.then((files) => files.filter((file) => isVideo(file)));
};
module.exports.currentSeeders = function (torrent) {
if (!torrent.infoHash) {
return Promise.reject(new Error("no infoHash"));
}
return new Promise((resolve) =>
Promise.all(TRACKERS.map((tracker) => new Promise((resolve) => {
const t = new Tracker(tracker, { }, torrent.infoHash);
t.run();
t.on("info", (inf) => resolve([inf.seeders, inf.leechers]));
setTimeout(() => resolve([0, 0]), 1000);
}))
).then((results) => resolve(results.reduce((seeders, next) => seeders + next[0], 0)))
);
};
// async function filesFromBtSeeds(infoHash) {
@@ -24,10 +43,13 @@ module.exports.torrentFiles = function(torrent) {
// }
function filesFromKat(infoHash) {
if (!infoHash) {
return Promise.reject(new Error("no infoHash"));
}
const url = `http://kat.rip/torrent/${infoHash}.html`;
return needle('get', url, { open_timeout: 2000 })
.then((response) => {
-if (!response.body) {
+if (!response.body || response.statusCode !== 200) {
throw new Error('torrent not found in kat')
}
return response.body
@@ -43,13 +65,41 @@ function filesFromKat(infoHash) {
size: convertToBytes($(row).find('td[class=\'torFileSize\']').text())
});
});
if (!files[files.length - 1].size) {
throw new Error('not full file list')
}
return files;
})
}
async function filesFromTorrentFile(torrent) {
if (!torrent.torrentLink) {
return Promise.reject(new Error("no torrentLink"));
}
return needle('get', torrent.torrentLink, { open_timeout: 2000 })
.then((response) => {
if (!response.body || response.statusCode !== 200) {
throw new Error('torrent not found')
}
return response.body
})
.then((body) => parseTorrent(body))
.then((info) => info.files.map((file, fileId) => ({
fileIndex: fileId,
name: file.name,
path: file.path.replace(/^[^\/]+\//, ''),
size: file.length
})));
}
async function filesFromTorrentStream(torrent) {
if (!torrent.infoHash && !torrent.magnetLink) {
return Promise.reject(new Error("no infoHash or magnetLink"));
}
return new Promise((resolve, rejected) => {
-const engine = new torrentStream(torrent.infoHash, { connections: MAX_PEER_CONNECTIONS });
+const engine = new torrentStream(torrent.magnetLink || torrent.infoHash, { connections: MAX_PEER_CONNECTIONS, trackers: TRACKERS });
engine.ready(() => {
const files = engine.files
@@ -74,7 +124,6 @@ function isVideo(title) {
return EXTENSIONS.includes(title.path.match(/\.(\w{2,4})$/)[1]);
}
function convertToBytes(sizeString) {
if (!sizeString) {
return;
@@ -88,22 +137,34 @@ function convertToBytes(sizeString) {
return Math.floor(parseFloat(sizeString) * multiplier);
}
function dynamicTimeout(torrent) {
if (torrent.seeders < 5) {
return 2000;
-} else if (torrent.seeders < 10) {
-return 3000;
-} else if (torrent.seeders < 20) {
+} else if (torrent.seeders < 10) {
return 4000;
-} else if (torrent.seeders < 30) {
+} else if (torrent.seeders < 20) {
return 5000;
-} else if (torrent.seeders < 50) {
+} else if (torrent.seeders < 30) {
return 7000;
+} else if (torrent.seeders < 50) {
+return 9000;
} else if (torrent.seeders < 100) {
-return 10000;
+return 12000;
} else {
return 15000;
}
}
function getTrackerList() {
return needle('get', 'https://torrents.me/tracker-list/', { open_timeout: 2000 })
.then((response) => {
if (!response.body || response.statusCode !== 200) {
throw new Error('tracker list not found')
}
return response.body
})
.then((body) => cheerio.load(body))
.then(($) => $('div[class="small-12 columns"] pre').text())
.then((text) => text.replace(/"/g, '').trim().split('\n'))
}
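The new fallback chain tries the `.torrent` file first, then the kat.rip page scrape, then a live torrent-stream download, while `currentSeeders` fans out to every known tracker and sums whatever answers arrive within the one-second window. A minimal usage sketch (the torrent shape is assumed from the HorribleSubs scraper; hash and link are placeholders):

const { init, torrentFiles, currentSeeders } = require('./lib/torrent');

(async () => {
  await init(); // fills TRACKERS before currentSeeders / torrent-stream use them
  const torrent = {
    infoHash: '0123456789abcdef0123456789abcdef01234567', // placeholder
    torrentLink: 'https://example.com/show.torrent' // hypothetical
  };
  console.log(await currentSeeders(torrent)); // seeders summed across trackers
  console.log(await torrentFiles(torrent)); // only files passing the isVideo filter
})();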

package-lock.json generated
View File

@@ -57,6 +57,11 @@
"resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
"integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
},
"async": {
"version": "0.9.2",
"resolved": "https://registry.npmjs.org/async/-/async-0.9.2.tgz",
"integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0="
},
"asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
@@ -293,6 +298,11 @@
"resolved": "https://registry.npmjs.org/buffer-writer/-/buffer-writer-2.0.0.tgz",
"integrity": "sha512-a7ZpuTZU1TRtnwyCNW3I5dc0wWNC3VR9S++Ewyk2HHZdrO3CQJqSpd+95Us590V6AL7JqUAH2IwZ/398PmNFgw=="
},
"byline": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/byline/-/byline-3.1.2.tgz",
"integrity": "sha1-hqY5TLGz8PwzT7F2sI5hrwxvHFk="
},
"bytes": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
@@ -1385,6 +1395,51 @@
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
},
"peer-search": {
"version": "0.6.11",
"resolved": "https://registry.npmjs.org/peer-search/-/peer-search-0.6.11.tgz",
"integrity": "sha1-Nfqs2lj5+YeDNOmj2A06PRzz7ag=",
"requires": {
"async": "0.9.x",
"bncode": "~0.2.3",
"byline": "3.1.2",
"compact2string": "~1.0.0",
"hat": "0.0.3"
},
"dependencies": {
"bncode": {
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/bncode/-/bncode-0.2.3.tgz",
"integrity": "sha1-N/hR3I5HGIqD+8D2+kd1ysyaMpY="
},
"compact2string": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/compact2string/-/compact2string-1.0.0.tgz",
"integrity": "sha1-S8jc7XudYKX0NFSWp6vO8hpgzvU="
},
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"needle": {
"version": "git+https://github.com/Ivshti/needle.git#16cceb74e24babd9c474071be1a15a09beac300d",
"from": "git+https://github.com/Ivshti/needle.git",
"requires": {
"debug": "^2.1.2",
"iconv-lite": "^0.4.4"
}
}
}
},
"peer-wire-protocol": {
"version": "0.7.1",
"resolved": "https://registry.npmjs.org/peer-wire-protocol/-/peer-wire-protocol-0.7.1.tgz",

package.json
View File

@@ -27,6 +27,7 @@
"node-gzip": "^1.1.2",
"parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#master",
"peer-search": "^0.6.x",
"pg": "^7.8.2",
"pg-hstore": "^2.3.2",
"sequelize": "^4.43.0",

scrapers/horiblesubs_scraper.js
View File

@@ -0,0 +1,136 @@
const moment = require('moment');
const needle = require('needle');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const decode = require('magnet-uri');
const horriblesubs = require('./api/horriblesubs');
const { torrentFiles, currentSeeders } = require('../lib/torrent');
const repository = require('../lib/repository');
const { getImdbId, getMetadata } = require('../lib/metadata');
const NAME = 'HorribleSubs';
const limiter = new Bottleneck({maxConcurrent: 1});
const entryLimiter = new Bottleneck({maxConcurrent: 20});
async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
if (!lastScraped.lastScraped) {
console.log(`${NAME}: no previous scrape exists`);
await _scrapeAllShows()
}
}
async function _scrapeAllShows() {
console.log(`${NAME}: getting all shows...`);
const shows = await horriblesubs.allShows();
return Promise.all(shows
.slice(0, 20)
//.filter(show => show.url.includes('piece'))
.map((show) => limiter.schedule(() => horriblesubs.showData(show)
.then((showData) => _parseShowData(showData))
.catch((err) => console.log(err)))));
}
const hardcodedShows = {
'199': 'tt2098220',
'347': 'tt0388629'
};
async function _parseShowData(showData) {
console.log(`${NAME}: scraping ${showData.title} data...`);
const imdbId = hardcodedShows[showData.showId] || await getImdbId({
name: showData.title.replace(/\W+/g, ' ').toLowerCase(),
type: 'series'
}).catch(() => undefined);
const metadata = imdbId && await getMetadata(imdbId, 'series') || {};
return Promise.all([
showData.singleEpisodes
.map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror))))
.reduce((a, b) => a.concat(b), []),
showData.packEpisodes
.map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() => _constructPackEntry(metadata, pack, mirror))))
.reduce((a, b) => a.concat(b), [])
].reduce((a, b) => a.concat(b), []))
.then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent)));
}
async function _constructSingleEntry(metadata, single, mirror) {
mirror.infoHash = decode(mirror.magnetLink).infoHash;
const seeders = await currentSeeders(mirror);
const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/);
const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season
const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode
const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode);
const title = `${single.title} ${single.episode} [${mirror.resolution}]`;
const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode};
return {
infoHash: mirror.infoHash,
provider: NAME,
title: title,
type: 'anime',
imdbId: metadata.imdbId,
uploadDate: single.uploadDate,
seeders: seeders,
files: [file]
}
}
async function _constructPackEntry(metadata, pack, mirror) {
mirror.infoHash = decode(mirror.magnetLink).infoHash;
const seeders = await currentSeeders(mirror);
const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/);
const xSeason = seasonMatch && parseInt(seasonMatch[1]);
const files = await torrentFiles(mirror)
.then((files) => files.map((file) => {
const title = file.path.match(/[^\/]+$/)[0];
const titleInfo = parse(title.replace(pack.title, ''));
return titleInfo.episodes
.map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode))
.map((actual) => ({
title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode
}));
}))
.then((files) => files.reduce((a, b) => a.concat(b), []))
.catch(() => []);
return {
infoHash: mirror.infoHash,
provider: NAME,
title: `${pack.title} ${pack.episode} [${mirror.resolution}]`,
type: 'anime',
imdbId: metadata.imdbId,
uploadDate: pack.uploadDate,
seeders: seeders,
files: files
}
}
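// Maps a raw (season, episode) pair to canonical season/episode/absolute-episode
// values, using metadata.episodeCount (episodes per season) when it is available.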
function actualSeasonEpisode(metadata, xSeason, xEpisode) {
if (xSeason) {
return {
season: xSeason,
episode: xEpisode,
absoluteEpisode: metadata.episodeCount && metadata.episodeCount
.slice(0, xSeason - 1)
.reduce((a, b) => a + b, xEpisode),
}
} else if (metadata.episodeCount) {
return metadata.episodeCount
.reduce((epInfo, epCount) => {
if (epInfo.episode > epCount) {
epInfo.season = epInfo.season + 1;
epInfo.episode = epInfo.episode - epCount;
}
return epInfo;
}, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode })
}
return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode }
}
module.exports = { scrape };
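A worked example of the season/episode mapping may help, assuming a hypothetical show whose `metadata.episodeCount` is `[26, 22]` (26 episodes in season 1, 22 in season 2):

actualSeasonEpisode({ episodeCount: [26, 22] }, undefined, 30);
// => { season: 2, episode: 4, absoluteEpisode: 30 } — 30 is past season 1's 26 episodes, so it becomes S2E4
actualSeasonEpisode({ episodeCount: [26, 22] }, 2, 4);
// => { season: 2, episode: 4, absoluteEpisode: 30 } — the explicit season adds the 26 earlier episodes back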

scrapers/piratebay_dump.js
View File

@@ -10,7 +10,7 @@ const { torrentFiles } = require('../lib/torrent');
const repository = require('../lib/repository');
const { getImdbId } = require('../lib/metadata');
-const NAME = 'thepiratebay';
+const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
const limiter = new Bottleneck({maxConcurrent: 40});