mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
adds horriblesubs scraper
index.js (14 changed lines)
@@ -1,10 +1,11 @@
const express = require("express");
const server = express();
+const { init } = require('./lib/torrent');
const { connect } = require('./lib/repository');
-const tpbDump = require('./scrapers/piratebay_dump');
const horribleSubs = require('./scrapers/api/horriblesubs');
+const horribleSubsScraper = require('./scrapers/horiblesubs_scraper');

-const providers = [tpbDump];
+const providers = [horribleSubsScraper];

async function scrape() {
  providers.forEach((provider) => provider.scrape());
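index.js treats scrapers as interchangeable providers: anything placed in the providers array only needs to expose a scrape() function, which is exactly what the new scrapers/horiblesubs_scraper.js exports at the end of this commit. A minimal sketch of that contract for a hypothetical extra provider:

// scrapers/example_scraper.js (hypothetical; the only contract index.js relies on
// is an exported scrape() function, mirroring scrapers/horiblesubs_scraper.js)
async function scrape() {
  console.log('ExampleProvider: scraping...');
  // fetch listings, resolve metadata, persist via lib/repository here
}

module.exports = { scrape };

// index.js would then register it alongside the others:
// const exampleScraper = require('./scrapers/example_scraper');
// const providers = [horribleSubsScraper, exampleScraper];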
@@ -17,12 +18,7 @@ server.post('/scrape', function(req, res) {

server.listen(7000, async function () {
  await connect();
+  await init();
  console.log('Scraper started');
-  // const shows = await horribleSubs.allShows();
-  // console.log(shows);
-  // const showInfo = await horribleSubs.showData('/shows/one-piece');
-  // console.log(showInfo)
-  // const latestEntries = await horribleSubs.getLatestEntries();
-  // console.log(latestEntries);
-  //scrape();
+  scrape();
});
lib/metadata.js

@@ -10,6 +10,7 @@ function getMetadata(imdbId, type) {
.then((body) => {
  if (body && body.meta && body.meta.name) {
    return {
      imdbId: imdbId,
      title: body.meta.name,
      year: body.meta.year,
      genres: body.meta.genres,
@@ -38,14 +39,6 @@ function escapeTitle(title, hyphenEscape = true) {
    .trim();
}

-const hardcodedTitles = {
-  'tt0388629': 'one piece',
-  'tt0182629': 'rurouni kenshin',
-  'tt2098220': 'hunter x hunter 2011',
-  'tt1409055': 'dragon ball kai',
-  'tt7441658': 'black clover tv'
-};
-
async function seriesMetadata(id) {
  const idInfo = id.split(':');
  const imdbId = idInfo[0];
@@ -54,7 +47,6 @@ async function seriesMetadata(id) {

const metadata = await getMetadata(imdbId, 'series');
const title = escapeTitle(metadata.title);
const hasEpisodeCount = metadata.episodeCount && metadata.episodeCount.length >= season;

return {
  imdb: imdbId,
@@ -91,4 +83,4 @@ async function getImdbId(info) {
  });
}

-module.exports = { movieMetadata, seriesMetadata, getImdbId };
+module.exports = { getMetadata, movieMetadata, seriesMetadata, getImdbId };
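Exporting getMetadata lets the new scraper fetch series details (title, year, genres, and the per-season episodeCount used further down) directly from an IMDb id. The metadata endpoint itself is not shown in this diff; only the fields read here are known. A small usage sketch:

// Usage sketch; only the fields read in this diff are assumed to exist.
const { getMetadata } = require('./lib/metadata');

async function printSeriesInfo(imdbId) {
  const metadata = await getMetadata(imdbId, 'series');
  console.log(metadata.title, metadata.year, metadata.genres);
  // episodeCount is an array of episodes per season; actualSeasonEpisode in the
  // new scraper uses it to convert absolute episode numbers to season/episode pairs.
  console.log(metadata.episodeCount);
}

printSeriesInfo('tt0388629'); // One Piece, one of the ids hardcoded elsewhere in this commit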
lib/repository.js

@@ -5,18 +5,19 @@ const POSTGRES_URI = process.env.POSTGRES_URI || 'postgres://torrentio:postgres@
const database = new Sequelize(POSTGRES_URI, { logging: false });

const Provider = database.define('provider', {
-  name: { type: Sequelize.STRING(16), primaryKey: true},
+  name: { type: Sequelize.STRING(32), primaryKey: true},
  lastScraped: { type: Sequelize.DATE }
});

const Torrent = database.define('torrent', {
  infoHash: { type: Sequelize.STRING(64), primaryKey: true },
-  provider: { type: Sequelize.STRING(16), allowNull: false },
+  provider: { type: Sequelize.STRING(32), allowNull: false },
  title: { type: Sequelize.STRING(128), allowNull: false },
  type: { type: Sequelize.STRING(16), allowNull: false },
  imdbId: { type: Sequelize.STRING(12) },
  uploadDate: { type: Sequelize.DATE, allowNull: false },
  seeders: { type: Sequelize.SMALLINT },
-  files: { type: Sequelize.ARRAY(Sequelize.TEXT) },
+  files: { type: Sequelize.JSONB }
});

const SkipTorrent = database.define('skip_torrent', {
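Switching files from a Postgres text array to JSONB matters because torrentFiles no longer emits `${fileIndex}@@${path}` strings; the HorribleSubs scraper stores one structured object per video file (title, season, episode, absoluteEpisode). A sketch of the record shape it hands to repository.updateTorrent, with illustrative values:

// Illustrative record; field names come from _constructSingleEntry later in this commit.
const repository = require('./lib/repository');

repository.updateTorrent({
  infoHash: 'cafebabecafebabecafebabecafebabecafebabe', // placeholder hash
  provider: 'HorribleSubs',
  title: 'One Piece 005 [720p]',
  type: 'anime',
  imdbId: 'tt0388629',
  uploadDate: new Date(),
  seeders: 42,
  // JSONB lets each file carry structured episode data instead of an "index@@path" string
  files: [{ title: 'One Piece 005 [720p]', season: 1, episode: 5, absoluteEpisode: 5 }]
});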
lib/torrent.js

@@ -2,17 +2,36 @@ const torrentStream = require('torrent-stream');
const cheerio = require('cheerio');
const needle = require('needle');
const parseTorrent = require('parse-torrent');
const cloudscraper = require('cloudscraper');
+const Tracker = require("peer-search/tracker");

const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
+let TRACKERS = [];

+module.exports.init = async function() {
+  TRACKERS = await getTrackerList();
+};

module.exports.torrentFiles = function(torrent) {
-  return filesFromKat(torrent.infoHash)
+  return filesFromTorrentFile(torrent)
+      .catch(() => filesFromKat(torrent.infoHash))
      .catch(() => filesFromTorrentStream(torrent))
-      .then((files) => files
-          .filter((file) => isVideo(file))
-          .map((file) => `${file.fileIndex}@@${file.path}`));
+      .then((files) => files.filter((file) => isVideo(file)));
};

+module.exports.currentSeeders = function (torrent) {
+  if (!torrent.infoHash) {
+    return Promise.reject(new Error("no infoHash"));
+  }
+  return new Promise((resolve) =>
+      Promise.all(TRACKERS.map((tracker) => new Promise((resolve) => {
+        const t = new Tracker(tracker, { }, torrent.infoHash);
+        t.run();
+        t.on("info", (inf) => resolve([inf.seeders, inf.leechers]));
+        setTimeout(() => resolve([0, 0]), 1000);
+      }))
+      ).then((results) => resolve(results.reduce((seeders, next) => seeders + next[0], 0)))
+  );
+};

// async function filesFromBtSeeds(infoHash) {
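torrentFiles now tries the cheapest source first: parse the .torrent file when the mirror exposes a torrentLink, fall back to scraping the KAT listing by info hash, and only then spin up torrent-stream to pull metadata from peers; the result is a plain file list instead of the old fileIndex@@path strings. currentSeeders gives every tracker from the scraped list a one second budget (the setTimeout resolves [0, 0] so a dead tracker cannot stall the Promise.all) and sums the seeder counts. A small usage sketch with placeholder values:

// Usage sketch; the hash and URL are placeholders, and the torrent shape matches
// what the HorribleSubs scraper passes in further down.
const { init, torrentFiles, currentSeeders } = require('./lib/torrent');

async function inspect(mirror) {
  await init();                                  // fetches the tracker list once
  const seeders = await currentSeeders(mirror);  // summed across all trackers
  const videos = await torrentFiles(mirror);     // only files with video extensions
  console.log(seeders, videos.map((file) => file.path));
}

inspect({
  infoHash: 'cafebabecafebabecafebabecafebabecafebabe', // placeholder hash
  torrentLink: 'https://example.com/file.torrent'       // optional; tried first
});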
@@ -24,10 +43,13 @@ module.exports.torrentFiles = function(torrent) {
// }

function filesFromKat(infoHash) {
+  if (!infoHash) {
+    return Promise.reject(new Error("no infoHash"));
+  }
  const url = `http://kat.rip/torrent/${infoHash}.html`;
  return needle('get', url, { open_timeout: 2000 })
      .then((response) => {
-        if (!response.body) {
+        if (!response.body || response.statusCode !== 200) {
          throw new Error('torrent not found in kat')
        }
        return response.body
@@ -43,13 +65,41 @@ function filesFromKat(infoHash) {
            size: convertToBytes($(row).find('td[class=\'torFileSize\']').text())
          });
        });

        if (!files[files.length - 1].size) {
          throw new Error('not full file list')
        }
        return files;
      })
}

+async function filesFromTorrentFile(torrent) {
+  if (!torrent.torrentLink) {
+    return Promise.reject(new Error("no torrentLink"));
+  }
+
+  needle('get', torrent.torrentLink, { open_timeout: 2000 })
+      .then((response) => {
+        if (!response.body || response.statusCode !== 200) {
+          throw new Error('torrent not found')
+        }
+        return response.body
+      })
+      .then((body) => parseTorrent(body))
+      .then((info) => info.files.map((file, fileId) => ({
+        fileIndex: fileId,
+        name: file.name,
+        path: file.path.replace(/^[^\/]+\//, ''),
+        size: file.length
+      })));
+}

async function filesFromTorrentStream(torrent) {
  if (!torrent.infoHash && !torrent.magnetLink) {
    return Promise.reject(new Error("no infoHash or magnetLink"));
  }
  return new Promise((resolve, rejected) => {
-    const engine = new torrentStream(torrent.infoHash, { connections: MAX_PEER_CONNECTIONS });
+    const engine = new torrentStream(torrent.magnetLink || torrent.infoHash, { connections: MAX_PEER_CONNECTIONS, trackers: TRACKERS });

    engine.ready(() => {
      const files = engine.files
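One thing to note: as committed, filesFromTorrentFile builds the needle promise chain but never returns it, so the async function resolves to undefined immediately and torrentFiles ends up failing on files.filter instead of falling back to the KAT listing. A minimal corrected sketch, assuming the missing return is the only intended change:

// Corrected sketch (assumption: the missing return is the only intended change).
async function filesFromTorrentFile(torrent) {
  if (!torrent.torrentLink) {
    return Promise.reject(new Error("no torrentLink"));
  }

  return needle('get', torrent.torrentLink, { open_timeout: 2000 })
      .then((response) => {
        if (!response.body || response.statusCode !== 200) {
          throw new Error('torrent not found');
        }
        // parse-torrent reads the raw .torrent buffer and exposes the file list
        return parseTorrent(response.body).files.map((file, fileId) => ({
          fileIndex: fileId,
          name: file.name,
          path: file.path.replace(/^[^\/]+\//, ''), // strip the top-level folder
          size: file.length
        }));
      });
}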
@@ -74,7 +124,6 @@ function isVideo(title) {
  return EXTENSIONS.includes(title.path.match(/\.(\w{2,4})$/)[1]);
}

function convertToBytes(sizeString) {
  if (!sizeString) {
    return;
@@ -88,22 +137,34 @@ function convertToBytes(sizeString) {
  return Math.floor(parseFloat(sizeString) * multiplier);
}

function dynamicTimeout(torrent) {
  if (torrent.seeders < 5) {
    return 2000;
-  } else if (torrent.seeders < 10) {
-    return 3000;
-  } else if (torrent.seeders < 20) {
+  } else if (torrent.seeders < 10) {
    return 4000;
-  } else if (torrent.seeders < 30) {
+  } else if (torrent.seeders < 20) {
    return 5000;
-  } else if (torrent.seeders < 50) {
+  } else if (torrent.seeders < 30) {
    return 7000;
+  } else if (torrent.seeders < 50) {
+    return 9000;
  } else if (torrent.seeders < 100) {
-    return 10000;
+    return 12000;
  } else {
    return 15000;
  }
}

+function getTrackerList() {
+  return needle('get', 'https://torrents.me/tracker-list/', { open_timeout: 2000 })
+      .then((response) => {
+        if (!response.body || response.statusCode !== 200) {
+          throw new Error('tracker list not found')
+        }
+        return response.body
+      })
+      .then((body) => cheerio.load(body))
+      .then(($) => $('div[class="small-12 columns"] pre').text())
+      .then((text) => text.replace(/"/g, '').trim().split('\n'))
+}
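dynamicTimeout maps seeder counts to progressively longer waits (roughly two seconds for barely seeded torrents up to fifteen seconds for well seeded ones), and getTrackerList scrapes the torrents.me tracker list that feeds both currentSeeders and the torrent-stream engine. dynamicTimeout is not called anywhere in the hunks shown, so the wiring below is an assumption about how a seeder-aware timeout could bound the peer metadata fetch:

// Hedged sketch: dynamicTimeout is not referenced in the hunks above, so this
// wiring (bounding the torrent-stream metadata fetch) is an assumption.
const torrentStream = require('torrent-stream');

function filesWithTimeout(torrent) {
  return new Promise((resolve, reject) => {
    // MAX_PEER_CONNECTIONS, TRACKERS and dynamicTimeout are the values defined in this file
    const engine = new torrentStream(torrent.magnetLink || torrent.infoHash,
        { connections: MAX_PEER_CONNECTIONS, trackers: TRACKERS });
    const timeout = setTimeout(() => {
      engine.destroy();                        // torrent-stream engines must be destroyed explicitly
      reject(new Error('no metadata in time'));
    }, dynamicTimeout(torrent));               // scales with torrent.seeders

    engine.ready(() => {
      clearTimeout(timeout);
      resolve(engine.files.map((file, fileIndex) =>
          ({ fileIndex, name: file.name, path: file.path, size: file.length })));
      engine.destroy();
    });
  });
}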
package-lock.json (generated, 55 changed lines)
@@ -57,6 +57,11 @@
  "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
  "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
},
+"async": {
+  "version": "0.9.2",
+  "resolved": "https://registry.npmjs.org/async/-/async-0.9.2.tgz",
+  "integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0="
+},
"asynckit": {
  "version": "0.4.0",
  "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
@@ -293,6 +298,11 @@
  "resolved": "https://registry.npmjs.org/buffer-writer/-/buffer-writer-2.0.0.tgz",
  "integrity": "sha512-a7ZpuTZU1TRtnwyCNW3I5dc0wWNC3VR9S++Ewyk2HHZdrO3CQJqSpd+95Us590V6AL7JqUAH2IwZ/398PmNFgw=="
},
+"byline": {
+  "version": "3.1.2",
+  "resolved": "https://registry.npmjs.org/byline/-/byline-3.1.2.tgz",
+  "integrity": "sha1-hqY5TLGz8PwzT7F2sI5hrwxvHFk="
+},
"bytes": {
  "version": "3.0.0",
  "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
@@ -1385,6 +1395,51 @@
  "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
  "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
},
+"peer-search": {
+  "version": "0.6.11",
+  "resolved": "https://registry.npmjs.org/peer-search/-/peer-search-0.6.11.tgz",
+  "integrity": "sha1-Nfqs2lj5+YeDNOmj2A06PRzz7ag=",
+  "requires": {
+    "async": "0.9.x",
+    "bncode": "~0.2.3",
+    "byline": "3.1.2",
+    "compact2string": "~1.0.0",
+    "hat": "0.0.3"
+  },
+  "dependencies": {
+    "bncode": {
+      "version": "0.2.3",
+      "resolved": "https://registry.npmjs.org/bncode/-/bncode-0.2.3.tgz",
+      "integrity": "sha1-N/hR3I5HGIqD+8D2+kd1ysyaMpY="
+    },
+    "compact2string": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/compact2string/-/compact2string-1.0.0.tgz",
+      "integrity": "sha1-S8jc7XudYKX0NFSWp6vO8hpgzvU="
+    },
+    "debug": {
+      "version": "2.6.9",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+      "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+      "requires": {
+        "ms": "2.0.0"
+      }
+    },
+    "ms": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+      "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
+    },
+    "needle": {
+      "version": "git+https://github.com/Ivshti/needle.git#16cceb74e24babd9c474071be1a15a09beac300d",
+      "from": "git+https://github.com/Ivshti/needle.git",
+      "requires": {
+        "debug": "^2.1.2",
+        "iconv-lite": "^0.4.4"
+      }
+    }
+  }
+},
"peer-wire-protocol": {
  "version": "0.7.1",
  "resolved": "https://registry.npmjs.org/peer-wire-protocol/-/peer-wire-protocol-0.7.1.tgz",
package.json

@@ -27,6 +27,7 @@
  "node-gzip": "^1.1.2",
  "parse-torrent": "^6.1.2",
  "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#master",
+  "peer-search": "^0.6.x",
  "pg": "^7.8.2",
  "pg-hstore": "^2.3.2",
  "sequelize": "^4.43.0",
scrapers/horiblesubs_scraper.js (new file, 136 lines)

@@ -0,0 +1,136 @@
const moment = require('moment');
const needle = require('needle');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const decode = require('magnet-uri');
const horriblesubs = require('./api/horriblesubs');
const { torrentFiles, currentSeeders } = require('../lib/torrent');
const repository = require('../lib/repository');
const { getImdbId, getMetadata } = require('../lib/metadata');

const NAME = 'HorribleSubs';

const limiter = new Bottleneck({maxConcurrent: 1});
const entryLimiter = new Bottleneck({maxConcurrent: 20});

async function scrape() {
  const lastScraped = await repository.getProvider({ name: NAME });

  if (!lastScraped.lastScraped) {
    console.log(`${NAME}: no previous scrapping exist`);
    await _scrapeAllShows()
  }
}
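scrape() only kicks off a full catalogue crawl when the provider row (keyed by NAME) has no lastScraped timestamp; recording that timestamp after a successful run is what would make later runs incremental. Only repository.getProvider appears in this diff, so the update call below is an assumption:

// Hedged sketch: repository.updateProvider is assumed (only getProvider appears in
// this diff); it shows how lastScraped would gate future full crawls.
async function scrape() {
  const provider = await repository.getProvider({ name: NAME });

  if (!provider.lastScraped) {
    console.log(`${NAME}: no previous scrape exists`);
    await _scrapeAllShows();
    await repository.updateProvider({ name: NAME, lastScraped: moment().toDate() }); // assumed helper
  }
}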
async function _scrapeAllShows() {
  console.log(`${NAME}: getting all shows...`);
  const shows = await horriblesubs.allShows();

  Promise.all(shows
      .slice(0, 20)
      //.filter(show => show.url.includes('piece'))
      .map((show) => limiter.schedule(() => horriblesubs.showData(show)
          .then((showData) => _parseShowData(showData))
          .catch((err) => console.log(err)))));
}

const hardcodedShows = {
  '199': 'tt2098220',
  '347': 'tt0388629'
};

async function _parseShowData(showData) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const imdbId = hardcodedShows[showData.showId] || await getImdbId({
    name: showData.title.replace(/\W+/g, ' ').toLowerCase(),
    type: 'series'
  }).catch(() => undefined);
  const metadata = imdbId && await getMetadata(imdbId, 'series') || {};

  return Promise.all([
    showData.singleEpisodes
        .map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror))))
        .reduce((a, b) => a.concat(b), []),
    showData.packEpisodes
        .map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() =>_constructPackEntry(metadata, pack, mirror))))
        .reduce((a, b) => a.concat(b), [])
  ].reduce((a, b) => a.concat(b), []))
      .then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent)));
}
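The two Bottleneck limiters keep the crawl polite: show pages are fetched strictly one at a time (maxConcurrent: 1) while up to 20 per-mirror entries are processed in parallel; schedule() returns a promise, so the nested map/reduce above simply flattens all scheduled jobs into one Promise.all. A standalone sketch of the same throttling pattern, with hypothetical URLs:

// Standalone illustration of the Bottleneck pattern used above (hypothetical URLs).
const Bottleneck = require('bottleneck');
const needle = require('needle');

const pageLimiter = new Bottleneck({ maxConcurrent: 1 });   // one page at a time
const itemLimiter = new Bottleneck({ maxConcurrent: 20 });  // up to 20 items in flight

function fetchAll(urls) {
  return Promise.all(urls.map((url) =>
      pageLimiter.schedule(() => needle('get', url, { open_timeout: 2000 }))
          .then((response) => itemLimiter.schedule(() => process(response.body)))));
}

function process(body) {
  return Promise.resolve(body && body.length); // placeholder work
}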
async function _constructSingleEntry(metadata, single, mirror) {
  mirror.infoHash = decode(mirror.magnetLink).infoHash;
  const seeders = await currentSeeders(mirror);
  const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/);
  const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season
  const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode
  const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode);
  const title = `${single.title} ${single.episode} [${mirror.resolution}]`;
  const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode};

  return {
    infoHash: mirror.infoHash,
    provider: NAME,
    title: title,
    type: 'anime',
    imdbId: metadata.imdbId,
    uploadDate: single.uploadDate,
    seeders: seeders,
    files: [file]
  }
}
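Each mirror arrives as a magnet link; the magnet-uri parser pulls the btih info hash out of it, and that hash becomes both the primary key in the torrent table and the input to currentSeeders. A tiny example (the hash value is illustrative):

// magnet-uri exposes a parser; the infoHash field is what the scraper keys on.
const decode = require('magnet-uri');

const mirror = {
  magnetLink: 'magnet:?xt=urn:btih:cafebabecafebabecafebabecafebabecafebabe&tr=udp://tracker.example:1337', // illustrative
  resolution: '1080p'
};
console.log(decode(mirror.magnetLink).infoHash); // 'cafebabecafebabecafebabecafebabecafebabe'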
async function _constructPackEntry(metadata, pack, mirror) {
  mirror.infoHash = decode(mirror.magnetLink).infoHash;
  const seeders = await currentSeeders(mirror);
  const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/);
  const xSeason = seasonMatch && parseInt(seasonMatch[1]);

  const files = await torrentFiles(mirror)
      .then((files) => files.map((file) => {
        const title = file.path.match(/[^\/]+$/)[0];
        const titleInfo = parse(title.replace(pack.title, ''));
        return titleInfo.episodes
            .map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode))
            .map((actual) => ({
              title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode
            }));
      }))
      .then((files) => files.reduce((a, b) => a.concat(b), []))
      .catch(() => []);

  return {
    infoHash: mirror.infoHash,
    provider: NAME,
    title: `${pack.title} ${pack.episode} [${mirror.resolution}]`,
    type: 'anime',
    imdbId: metadata.imdbId,
    uploadDate: pack.uploadDate,
    seeders: seeders,
    files: files
  }
}
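For batch packs the scraper lists the torrent contents via torrentFiles, then runs each filename (with the show name stripped) through parse-torrent-title to recover the episode numbers; the pinned TheBeastLT fork is what provides the episodes array relied on here. Roughly:

// Illustrative only; exact output fields depend on the pinned parse-torrent-title fork.
const { parse } = require('parse-torrent-title');

const fileName = '[HorribleSubs] Hunter x Hunter - 060 [720p].mkv'; // example filename
const titleInfo = parse(fileName.replace('Hunter x Hunter', ''));
console.log(titleInfo.episodes); // e.g. [60]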
function actualSeasonEpisode(metadata, xSeason, xEpisode) {
  if (xSeason) {
    return {
      season: xSeason,
      episode: xEpisode,
      absoluteEpisode: metadata.episodeCount && metadata.episodeCount
          .slice(0, xSeason - 1)
          .reduce((a, b) => a + b, xEpisode),
    }
  } else if (metadata.episodeCount) {
    return metadata.episodeCount
        .reduce((epInfo, epCount) => {
          if (epInfo.episode > epCount) {
            epInfo.season = epInfo.season + 1;
            epInfo.episode = epInfo.episode - epCount;
          }
          return epInfo;
        }, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode })
  }
  return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode }
}

module.exports = { scrape };
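actualSeasonEpisode converts between HorribleSubs' absolute numbering and Cinemeta-style season/episode pairs using the per-season episode counts from metadata: with a season hint it sums the earlier seasons to get the absolute number, and without one it walks the counts to find which season an absolute number falls into. A worked example with assumed counts:

// Worked example; the episode counts below are assumed, not taken from real metadata.
const metadata = { episodeCount: [26, 24, 25] }; // episodes per season

// Absolute episode 60 walks past seasons 1 (26 eps) and 2 (24 eps): 60 - 26 - 24 = 10.
console.log(actualSeasonEpisode(metadata, null, 60));
// => { season: 3, episode: 10, absoluteEpisode: 60 }

// With an explicit season hint the mapping runs the other way: 26 + 24 + 10 = 60.
console.log(actualSeasonEpisode(metadata, 3, 10));
// => { season: 3, episode: 10, absoluteEpisode: 60 }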
scrapers/piratebay_dump.js

@@ -10,7 +10,7 @@ const { torrentFiles } = require('../lib/torrent');
const repository = require('../lib/repository');
const { getImdbId } = require('../lib/metadata');

-const NAME = 'thepiratebay';
+const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';

const limiter = new Bottleneck({maxConcurrent: 40});