updates the package structure and continues the initial horriblesubs scraper (WIP)

This commit is contained in:
TheBeastLT
2020-01-07 13:33:46 +01:00
parent 643917939b
commit 051c50de3f
12 changed files with 264 additions and 280 deletions

View File

@@ -1,172 +0,0 @@
const moment = require('moment');
const fs = require('fs');
const needle = require('needle');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const decode = require('magnet-uri');
const horriblesubs = require('./api/horriblesubs');
const { Type } = require('../lib/types');
const { torrentFiles, currentSeeders } = require('../lib/torrent');
const repository = require('../lib/repository');
const { getImdbId, getMetadata, getKitsuId, getKitsuMetadata } = require('../lib/metadata');
const NAME = 'HorribleSubs';
const limiter = new Bottleneck({maxConcurrent: 5});
const entryLimiter = new Bottleneck({maxConcurrent: 20});
async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
if (!lastScraped.lastScraped) {
console.log(`${NAME}: no previous scraping exists`);
await _scrapeAllShows()
}
}
async function _scrapeAllShows() {
initMapping();
// console.log(`${NAME}: getting all shows...`);
// const shows = await horriblesubs.allShows();
// Promise.all(shows
// .slice(0, 20)
// //.filter(show => show.url.includes('piece'))
// .map((show) => limiter.schedule(() => horriblesubs.showData(show)
// .then((showData) => _parseShowData(showData))
// .catch((err) => console.log(err)))));
}
async function initMapping() {
console.log(`${NAME}: initiating kitsu mapping...`);
const currentMapping = require('../horrible_subs_mapping');
const mappings = Object.values(currentMapping);
const shows = await horriblesubs.allShows()
.then((shows) => shows.filter((show) => !mappings.find((mapping) => mapping.title === show.title)))
.then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
.then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), currentMapping));
fs.writeFile("./horrible_subs_mapping.json", JSON.stringify(shows), 'utf8', function (err) {
if (err) {
console.log("An error occurred while writing JSON Object to File.");
}
});
console.log(`${NAME}: finished kitsu mapping`);
}
async function enrichShow(show) {
console.log(`${NAME}: getting show info for ${show.title}...`);
const showId = await horriblesubs._getShowId(show.url)
.catch((error) => show.title);
const metadata = await getKitsuId(show.title)
.then((kitsuId) => getKitsuMetadata(kitsuId))
.catch((error) => {
console.log(`Failed getting kitsu meta: ${error.message}`);
return {};
});
return {
showId: showId,
...show,
kitsu_id: metadata.kitsu_id,
kitsuTitle: metadata.name,
kitsuSlug: metadata.slug,
imdb_id: metadata.imdb_id
}
}
async function _parseShowData(showData) {
console.log(`${NAME}: scraping ${showData.title} data...`);
// NOTE: hardcodedShows (presumably a showId -> imdbId override map) is not defined in this file
const imdbId = hardcodedShows[showData.showId] || await getImdbId({
name: showData.title.replace(/\W+/g, ' ').toLowerCase(),
type: 'series'
}).catch(() => undefined);
const metadata = imdbId && await getMetadata(imdbId, 'series') || {};
return Promise.all([
showData.singleEpisodes
.map((episode) => episode.mirrors.map((mirror) => entryLimiter.schedule(() => _constructSingleEntry(metadata, episode, mirror))))
.reduce((a, b) => a.concat(b), []),
showData.packEpisodes
.map((pack) => pack.mirrors.map((mirror) => entryLimiter.schedule(() =>_constructPackEntry(metadata, pack, mirror))))
.reduce((a, b) => a.concat(b), [])
].reduce((a, b) => a.concat(b), []))
.then((torrentEntries) => torrentEntries.forEach((torrent) => repository.updateTorrent(torrent)));
}
async function _constructSingleEntry(metadata, single, mirror) {
mirror.infoHash = decode(mirror.magnetLink).infoHash;
const seeders = await currentSeeders(mirror);
const seasonMatch = single.title.match(/[Ss]?(\d{1,2})\W*$/);
const xSeason = seasonMatch && parseInt(seasonMatch[1]); // could have a season
const xEpisode = parseInt(single.episode); // could be a seasonal or absolute episode
const { season, episode, absoluteEpisode } = actualSeasonEpisode(metadata, xSeason, xEpisode);
const title = `${single.title} ${single.episode} [${mirror.resolution}]`;
const file = { title: title, season: season, episode: episode, absoluteEpisode: absoluteEpisode};
return {
infoHash: mirror.infoHash,
provider: NAME,
title: title,
type: Type.ANIME,
imdbId: metadata.imdbId,
uploadDate: single.uploadDate,
seeders: seeders,
files: [file]
}
}
async function _constructPackEntry(metadata, pack, mirror) {
mirror.infoHash = decode(mirror.magnetLink).infoHash;
const seeders = await currentSeeders(mirror);
const seasonMatch = pack.title.match(/[Ss]?(\d{1,2})\W*$/);
const xSeason = seasonMatch && parseInt(seasonMatch[1]);
const files = await torrentFiles(mirror)
.then((files) => files.map((file) => {
const title = file.path.match(/[^\/]+$/)[0];
const titleInfo = parse(title.replace(pack.title, ''));
return titleInfo.episodes
.map((xEpisode) => actualSeasonEpisode(metadata, xSeason, xEpisode))
.map((actual) => ({
title: title, season: actual.season, episode: actual.episode, absoluteEpisode: actual.absoluteEpisode
}));
}))
.then((files) => files.reduce((a, b) => a.concat(b), []))
.catch(() => []);
return {
infoHash: mirror.infoHash,
provider: NAME,
title: `${pack.title} ${pack.episode} [${mirror.resolution}]`,
type: Type.ANIME,
imdbId: metadata.imdbId,
uploadDate: pack.uploadDate,
seeders: seeders,
files: files
}
}
function actualSeasonEpisode(metadata, xSeason, xEpisode) {
if (xSeason) {
return {
season: xSeason,
episode: xEpisode,
absoluteEpisode: metadata.episodeCount && metadata.episodeCount
.slice(0, xSeason - 1)
.reduce((a, b) => a + b, xEpisode),
}
} else if (metadata.episodeCount) {
return metadata.episodeCount
.reduce((epInfo, epCount) => {
if (epInfo.episode > epCount) {
epInfo.season = epInfo.season + 1;
epInfo.episode = epInfo.episode - epCount;
}
return epInfo;
}, { season: 1, episode: xEpisode, absoluteEpisode: xEpisode })
}
return { season: xSeason || 1, episode: xEpisode, absoluteEpisode: xEpisode }
}
module.exports = { scrape };
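A minimal sketch of how actualSeasonEpisode resolves numbering above, assuming metadata.episodeCount is an array of per-season episode counts (sample values, not from the source):

// Suppose a two-season show with 12 and 13 episodes respectively.
const metadata = { episodeCount: [12, 13] };
// Season marker present: S2 episode 6 maps to absolute episode 12 + 6 = 18.
actualSeasonEpisode(metadata, 2, 6);
// => { season: 2, episode: 6, absoluteEpisode: 18 }
// No season marker: absolute episode 20 walks past season 1 (20 > 12)
// and lands on season 2, episode 8.
actualSeasonEpisode(metadata, undefined, 20);
// => { season: 2, episode: 8, absoluteEpisode: 20 }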

View File

@@ -1,13 +1,14 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const defaultUrl = 'https://horriblesubs.info';
const defaultTimeout = 5000;
function allShows(config = {}) {
return _getContent('/shows', config)
.then(($) => $('div[class="ind-show"]')
.map((index, element) => $(element).children('a'))
.map((index, element) => ({
title: element.attr('title'),
@@ -71,11 +72,12 @@ function _getEntries(endpoint, config) {
episode: $(element).find('a[class="rls-label"]').find('strong').text(),
uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
mirrors: $(element).find('div[class="rls-links-container"]').children()
.map((indexLink, elementLink) => ({
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
})).get()
.map((indexLink, elementLink) => ({
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
infoHash: decode($(elementLink).find('a[title="Magnet Link"]').attr('href')).infoHash,
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
})).get()
})).get())
}
@@ -84,13 +86,14 @@ function _getAllLatestEntries(config, page = 0) {
const entriesEndpoint = `/api.php?method=getlatest${pageParam}`;
return _getContent(entriesEndpoint, config)
.then(($) => $('li a')
.map((index, element) => ({
urlEndpoint: $(element).attr('href'),
episode: $(element).find('strong').text()
})).get())
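// fewer than 12 entries means the last page was reached (presumably the endpoint's page size)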
.then((entries) => entries.length < 12
? entries
: _getAllLatestEntries(config, page + 1)
.then((nextEntries) => entries.concat(nextEntries)))
}
async function _findLatestEntry(entry, config) {
@@ -99,8 +102,8 @@ async function _findLatestEntry(entry, config) {
let page = 0;
let reachedEnd = false;
while (!foundEntry && !reachedEnd) {
const allEntries = await _getAllEntries(showId, 'show', config, page, false);
foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
page = page + 1;
reachedEnd = allEntries.length === 0;

File diff suppressed because it is too large

View File

@@ -0,0 +1,122 @@
const fs = require('fs');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const horriblesubs = require('./horriblesubs_api.js');
const repository = require('../../lib/repository');
const { Type } = require('../../lib/types');
const { updateCurrentSeeders } = require('../../lib/torrent');
const { parseTorrentFiles } = require('../../lib/torrentFiles');
const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');
const NAME = 'HorribleSubs';
const limiter = new Bottleneck({ maxConcurrent: 5 });
const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
if (!lastScraped.lastScraped) {
console.log(`${NAME}: no previous scraping exists`);
await _scrapeAllShows()
}
}
async function _scrapeAllShows() {
console.log(`${NAME}: getting all shows...`);
const shows = await horriblesubs.allShows();
return Promise.all(shows
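// WIP: only the first 20 shows are processed for now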
.slice(0, 20)
.map((show) => limiter.schedule(() => horriblesubs.showData(show)
.then((showData) => _parseShowData(showData))
.catch((err) => console.log(err)))));
}
async function initMapping() {
console.log(`${NAME}: initiating kitsu mapping...`);
const shows = await horriblesubs.allShows()
.then((shows) => shows.filter((show) => !showMappings[show.title]))
.then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
.then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
fs.writeFile("./scrapers/horriblesubs/horriblesubs_mapping.json", JSON.stringify(shows), 'utf8', function (err) {
if (err) {
console.log("An error occurred while writing JSON Object to File.");
} else {
console.log(`${NAME}: finished kitsu mapping`);
}
});
}
async function enrichShow(show) {
console.log(`${NAME}: getting show info for ${show.title}...`);
const showId = await horriblesubs._getShowId(show.url)
.catch((error) => show.title);
const metadata = await getKitsuId(show.title)
.then((kitsuId) => getMetadata(kitsuId))
.catch((error) => {
console.log(`Failed getting kitsu meta: ${error.message}`);
return {};
});
return {
showId: showId,
...show,
kitsu_id: metadata.kitsu_id,
kitsuTitle: metadata.name,
imdb_id: metadata.imdb_id
}
}
async function _parseShowData(showData) {
console.log(`${NAME}: scraping ${showData.title} data...`);
const showMapping = showMappings[showData.title];
const kitsuId = showMapping && showMapping.kitsu_id;
if (!showMapping) {
throw new Error(`No kitsu mapping found for ${showData.title}`);
}
if (!kitsuId) {
throw new Error(`No kitsuId found for ${showData.title}`);
}
return Promise.all([].concat(showData.singleEpisodes).concat(showData.packEpisodes)
.map((episodeInfo) => episodeInfo.mirrors
.map((mirror) => ({
provider: NAME,
...mirror,
title: `${episodeInfo.title} ${episodeInfo.episode} [${mirror.resolution}]`,
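// flat placeholder size for every mirror; presumably refined once the torrent itself is fetched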
size: 300000000,
type: Type.ANIME,
uploadDate: episodeInfo.uploadDate,
})))
.reduce((a, b) => a.concat(b), [])
.map((incompleteTorrent) => entryLimiter.schedule(() => checkIfExists(incompleteTorrent)
.then((torrent) => torrent && updateCurrentSeeders(torrent))
.then((torrent) => torrent && parseTorrentFiles(torrent, undefined, kitsuId)
.then((files) => verifyFiles(torrent, files))
.then((files) => repository.createTorrent(torrent)
.then(() => files.forEach(file => repository.createFile(file)))
.then(() => console.log(`Created entry for ${torrent.title}`)))))))
.then(() => console.log(`${NAME}: finished scraping ${showData.title} data`));
}
function verifyFiles(torrent, files) {
if (files && files.length) {
return files;
}
throw new Error(`No video files found for: ${torrent.title}`);
}
async function checkIfExists(torrent) {
const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
if (!existingTorrent) {
return torrent; // no torrent exists yet
} else if (existingTorrent.provider === NAME) {
return undefined; // torrent by this provider already exists
}
return { ...torrent, size: existingTorrent.size, seeders: existingTorrent.seeders };
}
module.exports = { scrape };
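For orientation, a hypothetical horriblesubs_mapping.json entry in the shape enrichShow produces, keyed by show title (all values illustrative, not from the real file):

{
  "Some Show": {
    "showId": 123,
    "title": "Some Show",
    "url": "/shows/some-show",
    "kitsu_id": 456,
    "kitsuTitle": "Some Show",
    "imdb_id": "tt0000000"
  }
}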

View File

@@ -1,39 +0,0 @@
require('dotenv').config();
const repository = require('../../lib/repository');
const { parseTorrentFiles } = require('../../lib/torrentFiles');
const { Type } = require('../../lib/types');
async function addMissingEpisodes() {
const torrent = { infoHash: '0ec780c2c7f8d5b38e61827f0b53c77c3d22f955' };
const torrentFiles = await require('../../lib/torrent').torrentFiles(torrent);
const storedFiles = await repository.getFiles(torrent)
.then((files) => files.reduce((map, next) => (map[next.fileIndex] = next, map), {}));
const imdbId = Object.values(storedFiles)[0].imdbId;
torrentFiles
.filter((file) => !storedFiles[file.fileIndex])
.map((file) => ({
infoHash: torrent.infoHash,
fileIndex: file.fileIndex,
title: file.name,
size: file.size,
imdbId: imdbId,
imdbSeason: parseInt(file.name.match(/(\d+)[ .]?-[ .]?\d+/)[1], 10),
imdbEpisode: parseInt(file.name.match(/\d+[ .]?-[ .]?(\d+)/)[1], 10),
}))
.forEach((file) => repository.createFile(file));
}
async function findAllFiles() {
const torrent = {
infoHash: '6b95e5cfde9aaa71970a14f6bb6b9de19e2cbfa1',
title: '[OMDA] Bleach + Filmes + Ovas (480p-720p x264 AAC-MP3) [rich_jc]',
type: Type.SERIES
};
const imdbId = 'tt0434665';
return parseTorrentFiles(torrent, imdbId).then((files) => console.log(files));
}
//addMissingEpisodes().then(() => console.log('Finished'));
findAllFiles().then(() => console.log('Finished'));

View File

@@ -156,7 +156,8 @@ function parseBody(body) {
$('table[id=\'searchResult\'] tr').each(function() {
const name = $(this).find('.detLink').text();
if (!name || name === 'Do NOT download any torrent before hiding your IP with a VPN.') {
const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/);
if (!name || !sizeMatcher) {
return;
}
torrents.push({
@@ -166,7 +167,7 @@ function parseBody(body) {
magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'),
category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10),
subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10),
size: parseSize($(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/)[1])
size: parseSize(sizeMatcher[1])
});
});
resolve(torrents);
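To see what the new sizeMatcher guard operates on, the .detDesc cell of a result row carries the size inline, e.g. (sample string, not from the source):

const detDesc = 'Uploaded 06-17 2016, Size 1.37 GiB, ULed by someuser';
const sizeMatcher = detDesc.match(/(?:,\s?Size\s)(.+),/);
// sizeMatcher[1] === '1.37 GiB'; rows without this pattern are now skipped
// instead of throwing on a null match.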

View File

@@ -5,12 +5,12 @@ const { ungzip } = require('node-gzip');
const LineByLineReader = require('line-by-line');
const fs = require('fs');
const { parse } = require('parse-torrent-title');
const pirata = require('./api/thepiratebay');
const thepiratebay = require('./thepiratebay_api.js');
const bing = require('nodejs-bing');
const { Type } = require('../lib/types');
const repository = require('../lib/repository');
const { getImdbId, escapeTitle } = require('../lib/metadata');
const { parseTorrentFiles } = require('../lib/torrentFiles');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const { getImdbId, escapeTitle } = require('../../lib/metadata');
const { parseTorrentFiles } = require('../../lib/torrentFiles');
const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
@@ -21,7 +21,7 @@ async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
const lastDump = { updatedAt: 2147000000 };
const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MM-DD HH:mm:ss').toDate();
//const lastDump = await pirata.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
//const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) {
console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
@@ -83,16 +83,16 @@ async function scrape() {
}
}
const allowedCategories = [
pirata.Categories.VIDEO.MOVIES,
pirata.Categories.VIDEO.MOVIES_HD,
pirata.Categories.VIDEO.MOVIES_DVDR,
pirata.Categories.VIDEO.MOVIES_3D,
pirata.Categories.VIDEO.TV_SHOWS,
pirata.Categories.VIDEO.TV_SHOWS_HD
thepiratebay.Categories.VIDEO.MOVIES,
thepiratebay.Categories.VIDEO.MOVIES_HD,
thepiratebay.Categories.VIDEO.MOVIES_DVDR,
thepiratebay.Categories.VIDEO.MOVIES_3D,
thepiratebay.Categories.VIDEO.TV_SHOWS,
thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
const seriesCategories = [
pirata.Categories.VIDEO.TV_SHOWS,
pirata.Categories.VIDEO.TV_SHOWS_HD
thepiratebay.Categories.VIDEO.TV_SHOWS,
thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
async function processTorrentRecord(record) {
const alreadyExists = await repository.getSkipTorrent(record)
@@ -132,7 +132,7 @@ async function processTorrentRecord(record) {
seeders: torrentFound.seeders,
};
if (!imdbId) {
if (!imdbId && !titleInfo.complete) {
console.log(`imdbId not found: ${torrentFound.name}`);
repository.createFailedImdbTorrent(torrent);
return;
@@ -145,8 +145,8 @@ async function processTorrentRecord(record) {
}
repository.createTorrent(torrent)
.then(() => files.forEach(file => repository.createFile(file)));
console.log(`Created entry for ${torrentFound.name}`);
.then(() => files.forEach(file => repository.createFile(file)))
.then(() => console.log(`Created entry for ${torrentFound.name}`));
}
async function findTorrent(record) {
@@ -158,7 +158,7 @@ async function findTorrentInSource(record) {
let page = 0;
let torrentFound;
while (!torrentFound && page < 5) {
const torrents = await pirata.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0];
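// an empty page means no more results: jump past the page < 5 cap to stop searching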
page = torrents.length === 0 ? 1000 : page + 1;
}
@@ -178,10 +178,19 @@ async function findTorrentViaBing(record) {
}
return result.link.match(/torrent\/(\w+)\//)[1];
})
.then((torrentId) => pirata.torrent(torrentId))
.then((torrentId) => thepiratebay.torrent(torrentId))
}
function downloadDump(dump) {
try {
if (fs.existsSync(CSV_FILE_PATH)) {
console.log('dump file already exist...');
return;
}
} catch (err) {
console.error(err);
}
console.log('downloading dump file...');
return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
.then((response) => response.body)