[scraper] saves opened torrent contents and videos and subs separately

This commit is contained in:
TheBeastLT
2020-05-02 10:16:03 +02:00
parent 4cf7bfb04a
commit 14b2f6b1a2
8 changed files with 224 additions and 94 deletions

4
package-lock.json generated
View File

@@ -1687,8 +1687,8 @@
} }
}, },
"parse-torrent-title": { "parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", "version": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", "from": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"requires": { "requires": {
"moment": "^2.24.0" "moment": "^2.24.0"
} }

View File

@@ -31,7 +31,7 @@
"node-schedule": "^1.3.2", "node-schedule": "^1.3.2",
"nodejs-bing": "^0.1.0", "nodejs-bing": "^0.1.0",
"parse-torrent": "^6.1.2", "parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"pg": "^7.8.2", "pg": "^7.8.2",
"pg-hstore": "^2.3.2", "pg-hstore": "^2.3.2",
"rarbg-api": "^1.1.3", "rarbg-api": "^1.1.3",

52
scraper/lib/extension.js Normal file
View File

@@ -0,0 +1,52 @@
// Lower-case file extensions (without the leading dot) that are treated as video files.
const VIDEO_EXTENSIONS = [
  "3g2", "3gp", "avi", "flv",
  "mkv", "mov", "mp2", "mp4",
  "m4v", "mpe", "mpeg", "mpg",
  "mpv", "webm", "wmv", "ogm",
];
// Lower-case file extensions (without the leading dot) that are treated as subtitle files.
const SUBTITLE_EXTENSIONS = [
  "aqt", "gsub", "jss", "sub",
  "ttxt", "pjs", "psb", "rt",
  "smi", "slt", "ssf", "srt",
  "ssa", "ass", "usf", "idx",
  "vtt",
];
/**
 * Tells whether a file name carries one of the known video extensions.
 *
 * @param {string} filename - file name or path to inspect
 * @returns the result of `isExtension` for `VIDEO_EXTENSIONS` (truthy on a match)
 */
function isVideo(filename) {
  const hasVideoExtension = isExtension(filename, VIDEO_EXTENSIONS);
  return hasVideoExtension;
}
/**
 * Tells whether a file name carries one of the known subtitle extensions.
 *
 * @param {string} filename - file name or path to inspect
 * @returns the result of `isExtension` for `SUBTITLE_EXTENSIONS` (truthy on a match)
 */
function isSubtitle(filename) {
  const hasSubtitleExtension = isExtension(filename, SUBTITLE_EXTENSIONS);
  return hasSubtitleExtension;
}
/**
 * Checks whether a file name ends with one of the given extensions.
 *
 * The extension is the trailing `.xxx` part of the name, 2 to 4 word characters long,
 * and is compared case-insensitively against `extensions`.
 *
 * @param {string} filename - file name or path to inspect
 * @param {string[]} extensions - lower-case extensions without the leading dot
 * @returns {boolean} `true` when the name ends with a listed extension, `false` otherwise
 *   (including when `filename` is not a string or has no recognisable extension)
 */
function isExtension(filename, extensions) {
  // A missing or non-string name cannot match anything; report "no" instead of throwing.
  if (typeof filename !== 'string') {
    return false;
  }
  const extensionMatch = filename.match(/\.(\w{2,4})$/);
  // Compare against null explicitly so the result is always a strict boolean, never null.
  return extensionMatch !== null && extensions.includes(extensionMatch[1].toLowerCase());
}
module.exports = { isVideo, isSubtitle }

View File

@@ -137,4 +137,4 @@ async function getKitsuId(info) {
})); }));
} }
module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML }; module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML, escapeTitle };

View File

@@ -31,7 +31,7 @@ const Torrent = database.define('torrent',
languages: { type: Sequelize.STRING(256) }, languages: { type: Sequelize.STRING(256) },
resolution: { type: Sequelize.STRING(16) }, resolution: { type: Sequelize.STRING(16) },
reviewed: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }, reviewed: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false },
subsChecked: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false } opened: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }
} }
); );
@@ -104,12 +104,36 @@ const Subtitle = database.define('subtitle',
} }
); );
// 'content' model: one row per file listed inside a torrent.
// Rows are identified by the composite primary key (infoHash, fileIndex).
const Content = database.define('content',
{
// Hash of the owning torrent; references Torrent.infoHash and is declared with
// onDelete: 'CASCADE'.
infoHash: {
type: Sequelize.STRING(64),
primaryKey: true,
allowNull: false,
references: { model: Torrent, key: 'infoHash' },
onDelete: 'CASCADE'
},
// Index of the file; second half of the composite primary key.
// NOTE(review): presumably the file's position within the torrent - confirm against the writers.
fileIndex: {
type: Sequelize.INTEGER,
primaryKey: true,
allowNull: false
},
// File path (required, up to 512 characters).
path: { type: Sequelize.STRING(512), allowNull: false },
// File size; optional. NOTE(review): presumably bytes - confirm.
size: { type: Sequelize.BIGINT },
},
{
// Disable Sequelize's automatic timestamp handling for this model.
timestamps: false,
}
);
const SkipTorrent = database.define('skip_torrent', { const SkipTorrent = database.define('skip_torrent', {
infoHash: { type: Sequelize.STRING(64), primaryKey: true }, infoHash: { type: Sequelize.STRING(64), primaryKey: true },
}); });
Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false }); Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false });
File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false }); File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
Torrent.hasMany(Content, { foreignKey: 'infoHash', constraints: false });
Content.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false }); File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false });
Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false }); Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false });
@@ -163,7 +187,9 @@ function getUpdateSeedersTorrents() {
} }
function createTorrent(torrent) { function createTorrent(torrent) {
return Torrent.upsert(torrent); return Torrent.upsert(torrent)
.then(() => createContents(torrent.infoHash, torrent.contents))
.then(() => createSubtitles(torrent.infoHash, torrent.subtitles));
} }
function setTorrentSeeders(infoHash, seeders) { function setTorrentSeeders(infoHash, seeders) {
@@ -174,7 +200,7 @@ function setTorrentSeeders(infoHash, seeders) {
} }
function createFile(file) { function createFile(file) {
return File.upsert(file, { include: [Subtitle] }); return File.create(file, { include: [Subtitle] });
} }
function getFiles(torrent) { function getFiles(torrent) {
@@ -189,8 +215,27 @@ function deleteFile(file) {
return File.destroy({ where: { id: file.id } }) return File.destroy({ where: { id: file.id } })
} }
function createSubtitle(subtitle) { function createSubtitles(infoHash, subtitles) {
return Subtitle.upsert(subtitle); if (subtitles && subtitles.length) {
return Subtitle.bulkCreate(subtitles.map(subtitle => ({ infoHash, title: subtitle.path, ...subtitle })));
}
return Promise.resolve();
}
/**
 * Loads every subtitle row stored for the given torrent.
 *
 * @param torrent - object whose `infoHash` identifies the torrent
 * @returns the result of `Subtitle.findAll` filtered by that `infoHash`
 */
function getSubtitles(torrent) {
  const { infoHash } = torrent;
  return Subtitle.findAll({ where: { infoHash } });
}
/**
 * Stores the file listing of a torrent and then flags the torrent as opened.
 *
 * Does nothing (resolves immediately) when `contents` is missing or empty.
 *
 * @param infoHash - hash of the torrent the contents belong to
 * @param contents - array of content entries; each is spread over `{ infoHash }`,
 *   so an entry's own `infoHash` property (if any) takes precedence
 * @returns a promise that settles once the rows are created and the torrent is updated
 */
function createContents(infoHash, contents) {
  const hasContents = Boolean(contents && contents.length);
  if (!hasContents) {
    return Promise.resolve();
  }

  const rows = contents.map(content => ({ infoHash, ...content }));
  const markTorrentOpened = () => Torrent.update({ opened: true }, { where: { infoHash: infoHash } });
  return Content.bulkCreate(rows).then(markTorrentOpened);
}
function getContents(torrent) {
return Content.findAll({ where: { infoHash: torrent.infoHash } });
} }
function getSkipTorrent(torrent) { function getSkipTorrent(torrent) {
@@ -219,7 +264,10 @@ module.exports = {
getFiles, getFiles,
getFilesBasedOnTitle, getFilesBasedOnTitle,
deleteFile, deleteFile,
createSubtitle, createSubtitles,
getSubtitles,
createContents,
getContents,
getSkipTorrent, getSkipTorrent,
createSkipTorrent, createSkipTorrent,
getTorrentsWithoutSize getTorrentsWithoutSize

View File

@@ -4,7 +4,7 @@ const parseTorrent = require('parse-torrent');
const BTClient = require('bittorrent-tracker') const BTClient = require('bittorrent-tracker')
const async = require('async'); const async = require('async');
const decode = require('magnet-uri'); const decode = require('magnet-uri');
const isVideo = require('./video'); const { isVideo, isSubtitle } = require('./extension');
const { cacheTrackers } = require('./cache'); const { cacheTrackers } = require('./cache');
const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_best.txt'; const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_best.txt';
@@ -49,13 +49,15 @@ module.exports.updateTorrentSize = function (torrent) {
module.exports.sizeAndFiles = torrent => filesAndSizeFromTorrentStream(torrent, 30000); module.exports.sizeAndFiles = torrent => filesAndSizeFromTorrentStream(torrent, 30000);
module.exports.torrentFiles = function (torrent) { module.exports.torrentFiles = function (torrent, timeout) {
return getFilesFromObject(torrent) return getFilesFromObject(torrent)
.catch(() => filesFromTorrentFile(torrent)) .catch(() => filesFromTorrentFile(torrent))
.catch(() => filesFromTorrentStream(torrent)) .catch(() => filesFromTorrentStream(torrent, timeout))
.then((files) => filterVideos(files)) .then(files => ({
.then((files) => filterSamples(files)) contents: files,
.then((files) => filterExtras(files)); videos: filterVideos(files),
subtitles: filterSubtitles(files)
}));
}; };
function getFilesFromObject(torrent) { function getFilesFromObject(torrent) {
@@ -86,11 +88,11 @@ async function filesFromTorrentFile(torrent) {
}))); })));
} }
async function filesFromTorrentStream(torrent) { async function filesFromTorrentStream(torrent, timeout) {
return filesAndSizeFromTorrentStream(torrent, 60000).then(result => result.files); return filesAndSizeFromTorrentStream(torrent, timeout).then(result => result.files);
} }
function filesAndSizeFromTorrentStream(torrent, timeout = 60000) { function filesAndSizeFromTorrentStream(torrent, timeout = 30000) {
if (!torrent.infoHash && !torrent.magnetLink) { if (!torrent.infoHash && !torrent.magnetLink) {
return Promise.reject(new Error("no infoHash or magnetLink")); return Promise.reject(new Error("no infoHash or magnetLink"));
} }
@@ -119,18 +121,17 @@ function filesAndSizeFromTorrentStream(torrent, timeout = 60000) {
} }
function filterVideos(files) { function filterVideos(files) {
return files.filter((file) => isVideo(file.path));
}
function filterSamples(files) {
const maxSize = Math.max(...files.map(file => file.size)); const maxSize = Math.max(...files.map(file => file.size));
const isSample = file => file.name.match(/sample/i) && maxSize / parseInt(file.size) > 10; const isSample = file => file.name.match(/sample/i) && maxSize / parseInt(file.size) > 10;
return files.filter(file => !isSample(file)); const isExtra = file => file.path.match(/extras?\//i);
return files
.filter(file => isVideo(file.path))
.filter(file => !isSample(file))
.filter(file => !isExtra(file));
} }
function filterExtras(files) { function filterSubtitles(files) {
const isExtra = file => file.path.match(/extras?\//i); return files.filter(file => isSubtitle(file.path));
return files.filter(file => !isExtra(file));
} }
async function getDefaultTrackers() { async function getDefaultTrackers() {

View File

@@ -33,21 +33,26 @@ async function createTorrentEntry(torrent, overwrite = false) {
return; return;
} }
const files = await parseTorrentFiles(torrent) const { contents, videos, subtitles } = await parseTorrentFiles(torrent)
.then(files => overwrite ? overwriteExistingFiles(torrent, files) : files); .then(torrentContents => overwrite ? overwriteExistingFiles(torrent, torrentContents) : torrentContents)
if (!files || !files.length) { .catch(error => {
console.log(`Failed getting files for ${torrent.title}`, error.message);
return {};
});
if (!videos || !videos.length) {
console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`); console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
return; return;
} }
return repository.createTorrent(torrent) return repository.createTorrent({ ...torrent, contents, subtitles })
.then(() => Promise.all(files.map(file => repository.createFile(file)))) .then(() => Promise.all(videos.map(video => repository.createFile(video))))
.then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`)); .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
} }
async function overwriteExistingFiles(torrent, files) { async function overwriteExistingFiles(torrent, torrentContents) {
if (files && files.length) { const videos = torrentContents && torrentContents.videos;
const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash }) if (videos && videos.length) {
const existingFiles = await repository.getFiles({ infoHash: videos[0].infoHash })
.then((existing) => existing .then((existing) => existing
.reduce((map, next) => { .reduce((map, next) => {
const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null; const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
@@ -56,9 +61,9 @@ async function overwriteExistingFiles(torrent, files) {
}, {})) }, {}))
.catch(() => undefined); .catch(() => undefined);
if (existingFiles && Object.keys(existingFiles).length) { if (existingFiles && Object.keys(existingFiles).length) {
return files const overwrittenVideos = videos
.map(file => { .map(file => {
const mapping = files.length === 1 && Object.keys(existingFiles).length === 1 const mapping = videos.length === 1 && Object.keys(existingFiles).length === 1
? Object.values(existingFiles)[0] ? Object.values(existingFiles)[0]
: existingFiles[file.fileIndex !== undefined ? file.fileIndex : null]; : existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
if (mapping) { if (mapping) {
@@ -66,9 +71,10 @@ async function overwriteExistingFiles(torrent, files) {
return { ...file, id: originalFile.id, size: originalFile.size || file.size }; return { ...file, id: originalFile.id, size: originalFile.size || file.size };
} }
return file; return file;
}) });
return { ...torrentContents, videos: overwrittenVideos };
} }
return files; return torrentContents;
} }
return Promise.reject(`No video files found for: ${torrent.title}`); return Promise.reject(`No video files found for: ${torrent.title}`);
} }

View File

@@ -3,7 +3,7 @@ const distance = require('jaro-winkler');
const { parse } = require('parse-torrent-title'); const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises'); const Promises = require('../lib/promises');
const { torrentFiles } = require('../lib/torrent'); const { torrentFiles } = require('../lib/torrent');
const { getMetadata, getImdbId } = require('../lib/metadata'); const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata');
const { Type } = require('./types'); const { Type } = require('./types');
const MIN_SIZE = 10 * 1024 * 1024; // 10 MB const MIN_SIZE = 10 * 1024 * 1024; // 10 MB
@@ -23,62 +23,78 @@ async function parseTorrentFiles(torrent) {
torrent.type = Type.MOVIE; torrent.type = Type.MOVIE;
} }
if (torrent.type === Type.MOVIE && !parsedTorrentName.seasons || metadata && metadata.type === Type.MOVIE) { if (torrent.type === Type.MOVIE && (!parsedTorrentName.seasons ||
if (parsedTorrentName.complete || typeof parsedTorrentName.year === 'string') { parsedTorrentName.season === 5 && parsedTorrentName.episode === 1)) {
return torrentFiles(torrent) return parseMovieFiles(torrent, parsedTorrentName, metadata);
.then(files => files.filter(file => file.size > MIN_SIZE))
.then(files => Promises.sequence(files
.map((file) => () => findMovieImdbId(file.name)
.then((newImdbId) => ({
infoHash: torrent.infoHash,
fileIndex: file.fileIndex,
title: file.path || file.name,
size: file.size,
imdbId: newImdbId,
})))))
.catch(error => {
console.log(`Failed getting files for ${torrent.title}`, error.message);
return [];
});
}
return [{
infoHash: torrent.infoHash,
title: torrent.title,
size: torrent.size,
imdbId: torrent.imdbId || metadata && metadata.imdbId,
kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
}];
} }
return getSeriesFiles(torrent, parsedTorrentName) return parseSeriesFiles(torrent, parsedTorrentName, metadata)
}
/**
 * Builds the video entries for a movie torrent.
 *
 * Videos not larger than MIN_SIZE are dropped. When exactly one video remains it
 * inherits the torrent-level ids (from `torrent` or `metadata`); otherwise an imdb id
 * is looked up per file, by file name, through `Promises.sequence`.
 *
 * @param torrent - torrent being parsed (`infoHash`, `title`, `size`, optional ids)
 * @param parsedName - parsed torrent title, forwarded to `getMoviesTorrentContent`
 * @param metadata - optional metadata object providing fallback `imdbId` / `kitsuId`
 * @returns `{ contents, videos, subtitles }` where `videos` holds the parsed entries
 */
async function parseMovieFiles(torrent, parsedName, metadata) {
  const torrentContent = await getMoviesTorrentContent(torrent, parsedName);
  const { contents, videos, subtitles } = torrentContent;
  const largeVideos = videos.filter(video => video.size > MIN_SIZE);

  if (largeVideos.length === 1) {
    // A single remaining video is the movie itself, so torrent-level ids apply to it.
    const onlyVideo = largeVideos[0];
    return {
      contents,
      videos: [{
        infoHash: torrent.infoHash,
        fileIndex: onlyVideo.fileIndex,
        title: onlyVideo.path || torrent.title,
        size: onlyVideo.size || torrent.size,
        imdbId: torrent.imdbId || metadata && metadata.imdbId,
        kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
      }],
      subtitles
    };
  }

  // Several (or zero) videos: resolve an imdb id for each file from its own name.
  const toParsedVideo = (file, newImdbId) => ({
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: newImdbId,
  });
  const lookups = largeVideos.map(file => () => findMovieImdbId(file.name)
      .then(newImdbId => toParsedVideo(file, newImdbId)));
  const parsedVideos = await Promises.sequence(lookups);

  return { contents, videos: parsedVideos, subtitles };
}
async function parseSeriesFiles(torrent, parsedName, metadata) {
const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent, parsedName);
const parsedVideos = await Promise.resolve(videos)
.then((files) => files .then((files) => files
.filter((file) => file.size > MIN_SIZE) .filter((file) => file.size > MIN_SIZE)
.map((file) => parseSeriesFile(file, parsedTorrentName))) .map((file) => parseSeriesFile(file, parsedName, torrent.type)))
.then((files) => decomposeEpisodes(torrent, files, metadata)) .then((files) => decomposeEpisodes(torrent, files, metadata))
.then((files) => assignKitsuOrImdbEpisodes(torrent, files, metadata)) .then((files) => assignKitsuOrImdbEpisodes(torrent, files, metadata))
.then((files) => Promise.all(files.map(file => file.isMovie .then((files) => Promise.all(files.map(file => file.isMovie
? mapSeriesMovie(file, torrent) ? mapSeriesMovie(file, torrent)
: mapSeriesEpisode(file, torrent, files)))) : mapSeriesEpisode(file, torrent, files))))
.then((files) => files.reduce((a, b) => a.concat(b), [])) .then((files) => files.reduce((a, b) => a.concat(b), []))
.catch((error) => { return { contents, videos: parsedVideos, subtitles };
console.log(`Failed getting files for ${torrent.title}`, error.message); }
return [];
async function getMoviesTorrentContent(torrent, parsedName) {
const hasMultipleMovie = parsedName.complete || typeof parsedName.year === 'string';
return torrentFiles(torrent)
.catch(error => {
if (!hasMultipleMovie) {
return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] }
}
return Promise.reject(error);
}); });
} }
async function getSeriesFiles(torrent, parsedName) { async function getSeriesTorrentContent(torrent, parsedName) {
const hasMultipleEpisodes = parsedName.complete || parsedName.hasMovies || torrent.size > MULTIPLE_FILES_SIZE || const hasMultipleEpisodes = parsedName.complete || parsedName.hasMovies || torrent.size > MULTIPLE_FILES_SIZE ||
(parsedName.seasons && parsedName.seasons.length > 1); (parsedName.seasons && parsedName.seasons.length > 1);
if (!hasMultipleEpisodes && (Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date))) { const hasSingleEpisode = Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date);
return [{ return torrentFiles(torrent)
name: torrent.title, .catch(error => {
path: torrent.title, if (!hasMultipleEpisodes && hasSingleEpisode) {
size: torrent.size return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] }
}]; }
} return Promise.reject(error);
});
return torrentFiles(torrent);
} }
async function mapSeriesEpisode(file, torrent, files) { async function mapSeriesEpisode(file, torrent, files) {
@@ -109,19 +125,20 @@ async function mapSeriesEpisode(file, torrent, files) {
} }
async function mapSeriesMovie(file, torrent) { async function mapSeriesMovie(file, torrent) {
return findMovieImdbId(file) const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined;
.then(imdbId => getMetadata(imdbId, Type.MOVIE).catch(() => ({ imdbId }))) const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined;
.then(metadata => [{ const metadata = getMetadata(imdbId, Type.MOVIE).catch(() => undefined);
infoHash: torrent.infoHash, return [{
fileIndex: file.fileIndex, infoHash: torrent.infoHash,
title: file.path || file.name, fileIndex: file.fileIndex,
size: file.size, title: file.path || file.name,
imdbId: metadata.imdbId, size: file.size,
kitsuId: metadata.kitsuId imdbId: metadata && metadata.imdbId || imdbId,
}]); kitsuId: metadata && metadata.kitsuId || kitsuId
}];
} }
function parseSeriesFile(file, parsedTorrentName) { function parseSeriesFile(file, parsedTorrentName, type) {
const fileInfo = parse(file.name); const fileInfo = parse(file.name);
// the episode may be in a folder containing season number // the episode may be in a folder containing season number
if (!fileInfo.season && file.path.includes('/')) { if (!fileInfo.season && file.path.includes('/')) {
@@ -143,7 +160,8 @@ function parseSeriesFile(file, parsedTorrentName) {
fileInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)]; fileInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
fileInfo.episode = fileInfo.episodes && fileInfo.episodes[0]; fileInfo.episode = fileInfo.episodes && fileInfo.episodes[0];
} }
fileInfo.isMovie = (parsedTorrentName.hasMovies && !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year)) fileInfo.isMovie = ((parsedTorrentName.hasMovies || type === Type.ANIME)
&& !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year))
|| (!fileInfo.season && !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i)); || (!fileInfo.season && !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i));
return { ...file, ...fileInfo }; return { ...file, ...fileInfo };
@@ -351,11 +369,16 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
return files; return files;
} }
function findMovieImdbId(title) { function findMovieImdbId(title, type) {
const parsedTitle = typeof title === 'string' ? parse(title) : title; const parsedTitle = typeof title === 'string' ? parse(title) : title;
return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined); return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined);
} }
/**
 * Looks up the kitsu id of a movie.
 *
 * @param title - raw title string (parsed with `parse` first) or an already parsed title object
 * @returns a promise for the id from `getKitsuId`, or `undefined` when the lookup rejects
 */
function findMovieKitsuId(title) {
  let parsedTitle = title;
  if (typeof title === 'string') {
    parsedTitle = parse(title);
  }
  return getKitsuId(parsedTitle, Type.MOVIE).catch(() => undefined);
}
function div100(episode) { function div100(episode) {
return (episode / 100 >> 0); // floor to nearest int return (episode / 100 >> 0); // floor to nearest int
} }