[scraper] saves opened torrent contents and videos and subs separately

This commit is contained in:
TheBeastLT
2020-05-02 10:16:03 +02:00
parent 4cf7bfb04a
commit 14b2f6b1a2
8 changed files with 224 additions and 94 deletions

4
package-lock.json generated
View File

@@ -1687,8 +1687,8 @@
}
},
"parse-torrent-title": {
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6",
"version": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"from": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"requires": {
"moment": "^2.24.0"
}

View File

@@ -31,7 +31,7 @@
"node-schedule": "^1.3.2",
"nodejs-bing": "^0.1.0",
"parse-torrent": "^6.1.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d",
"pg": "^7.8.2",
"pg-hstore": "^2.3.2",
"rarbg-api": "^1.1.3",

52
scraper/lib/extension.js Normal file
View File

@@ -0,0 +1,52 @@
// Lower-case file extensions (without the leading dot) that are treated as video files.
const VIDEO_EXTENSIONS = [
  "3g2", "3gp", "avi", "flv",
  "mkv", "mov", "mp2", "mp4",
  "m4v", "mpe", "mpeg", "mpg",
  "mpv", "webm", "wmv", "ogm"
];
// Lower-case file extensions (without the leading dot) that are treated as subtitle files.
const SUBTITLE_EXTENSIONS = [
  "aqt", "gsub", "jss", "sub", "ttxt", "pjs",
  "psb", "rt", "smi", "slt", "ssf", "srt",
  "ssa", "ass", "usf", "idx", "vtt"
];
/**
 * Tells whether a file name carries one of the extensions listed in VIDEO_EXTENSIONS.
 *
 * @param {string} filename - file name or path to inspect
 * @returns whatever isExtension yields for the video extension list (truthy on a match)
 */
function isVideo(filename) {
  const matchesVideoExtension = isExtension(filename, VIDEO_EXTENSIONS);
  return matchesVideoExtension;
}
/**
 * Tells whether a file name carries one of the extensions listed in SUBTITLE_EXTENSIONS.
 *
 * @param {string} filename - file name or path to inspect
 * @returns whatever isExtension yields for the subtitle extension list (truthy on a match)
 */
function isSubtitle(filename) {
  const matchesSubtitleExtension = isExtension(filename, SUBTITLE_EXTENSIONS);
  return matchesSubtitleExtension;
}
/**
 * Checks whether `filename` ends in a dot followed by one of the given extensions.
 *
 * Only a trailing extension of 2 to 4 word characters is considered, and the
 * comparison is case-insensitive on the file name side (the extension list is
 * expected to be lower-case).
 *
 * @param {string} filename - file name or path to inspect
 * @param {string[]} extensions - lower-case extensions without the leading dot
 * @returns {boolean} true when the trailing extension is in `extensions`;
 *   false otherwise, including when `filename` is not a string or has no extension
 */
function isExtension(filename, extensions) {
  // A missing or non-string name cannot match anything; avoid a TypeError on `.match`.
  if (typeof filename !== 'string') {
    return false;
  }
  const extensionMatch = filename.match(/\.(\w{2,4})$/);
  // Compare against null explicitly so the result is always a real boolean, never `null`.
  return extensionMatch !== null && extensions.includes(extensionMatch[1].toLowerCase());
}
module.exports = { isVideo, isSubtitle }

View File

@@ -137,4 +137,4 @@ async function getKitsuId(info) {
}));
}
module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML };
module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML, escapeTitle };

View File

@@ -31,7 +31,7 @@ const Torrent = database.define('torrent',
languages: { type: Sequelize.STRING(256) },
resolution: { type: Sequelize.STRING(16) },
reviewed: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false },
subsChecked: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }
opened: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }
}
);
@@ -104,12 +104,36 @@ const Subtitle = database.define('subtitle',
}
);
// Sequelize model 'content': one row per file listed inside a torrent.
// Rows are identified by the composite primary key (infoHash, fileIndex).
const Content = database.define('content',
{
// Info hash of the owning torrent; references Torrent.infoHash and is declared with
// onDelete 'CASCADE', so content rows are meant to be removed together with their torrent.
infoHash: {
type: Sequelize.STRING(64),
primaryKey: true,
allowNull: false,
references: { model: Torrent, key: 'infoHash' },
onDelete: 'CASCADE'
},
// Second half of the composite primary key; presumably the file's position inside
// the torrent — confirm against the code that builds the content list.
fileIndex: {
type: Sequelize.INTEGER,
primaryKey: true,
allowNull: false
},
// Required file path, limited to 512 characters.
path: { type: Sequelize.STRING(512), allowNull: false },
// Optional file size; BIGINT so large values fit (units are not stated here — presumably bytes).
size: { type: Sequelize.BIGINT },
},
{
// No automatic createdAt/updatedAt columns for this table.
timestamps: false,
}
);
// Sequelize model 'skip_torrent': a bare list of info hashes.
// NOTE(review): presumably torrents that should not be processed again — confirm against callers.
const SkipTorrent = database.define('skip_torrent', {
infoHash: { type: Sequelize.STRING(64), primaryKey: true },
});
// Associations. Every pair is declared with `constraints: false`, i.e. Sequelize is told
// not to add foreign key constraints for these relations itself.
// Torrent 1..n File and Torrent 1..n Content, both joined on infoHash.
Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false });
File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
Torrent.hasMany(Content, { foreignKey: 'infoHash', constraints: false });
Content.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
// File 1..n Subtitle, joined on fileId.
File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false });
Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false });
@@ -163,7 +187,9 @@ function getUpdateSeedersTorrents() {
}
function createTorrent(torrent) {
return Torrent.upsert(torrent);
return Torrent.upsert(torrent)
.then(() => createContents(torrent.infoHash, torrent.contents))
.then(() => createSubtitles(torrent.infoHash, torrent.subtitles));
}
function setTorrentSeeders(infoHash, seeders) {
@@ -174,7 +200,7 @@ function setTorrentSeeders(infoHash, seeders) {
}
function createFile(file) {
return File.upsert(file, { include: [Subtitle] });
return File.create(file, { include: [Subtitle] });
}
function getFiles(torrent) {
@@ -189,8 +215,27 @@ function deleteFile(file) {
return File.destroy({ where: { id: file.id } })
}
function createSubtitle(subtitle) {
return Subtitle.upsert(subtitle);
/**
 * Bulk-inserts the subtitle entries found in a torrent.
 *
 * Every row starts from `{ infoHash, title: subtitle.path }` and is then overlaid
 * with the subtitle's own properties, so values set on the subtitle itself win.
 *
 * @param {string} infoHash - info hash of the torrent the subtitles belong to
 * @param {Array<object>} [subtitles] - subtitle entries; may be missing or empty
 * @returns the result of Subtitle.bulkCreate, or a resolved Promise when there is nothing to insert
 */
function createSubtitles(infoHash, subtitles) {
  const nothingToStore = !subtitles || !subtitles.length;
  if (nothingToStore) {
    return Promise.resolve();
  }
  const toRow = entry => ({ infoHash, title: entry.path, ...entry });
  const rows = subtitles.map(entry => toRow(entry));
  return Subtitle.bulkCreate(rows);
}
/**
 * Looks up all stored subtitles of a torrent.
 *
 * @param {{infoHash: string}} torrent - only `infoHash` is read
 * @returns the result of Subtitle.findAll filtered by that info hash
 */
function getSubtitles(torrent) {
  const { infoHash } = torrent;
  const query = { where: { infoHash } };
  return Subtitle.findAll(query);
}
/**
 * Stores the full file listing of an opened torrent and flags the torrent as opened.
 *
 * Content rows are keyed by (infoHash, fileIndex). The same torrent can be written
 * more than once (the torrent row itself is upserted), so rows that already exist
 * are skipped instead of failing the whole insert with a primary key violation.
 *
 * @param {string} infoHash - info hash of the torrent the contents belong to
 * @param {Array<object>} [contents] - content entries; may be missing or empty
 * @returns a Promise-like that settles after the rows are inserted and the torrent
 *   is updated with `opened: true`, or a resolved Promise when there is nothing to insert
 */
function createContents(infoHash, contents) {
  if (!contents || !contents.length) {
    return Promise.resolve();
  }
  // Properties on the content entry itself take precedence over the default infoHash.
  const rows = contents.map(content => ({ infoHash, ...content }));
  // ignoreDuplicates: re-saving a known torrent must not reject on existing rows.
  return Content.bulkCreate(rows, { ignoreDuplicates: true })
      .then(() => Torrent.update({ opened: true }, { where: { infoHash: infoHash } }));
}
/**
 * Looks up all stored content rows of a torrent.
 *
 * @param {{infoHash: string}} torrent - only `infoHash` is read
 * @returns the result of Content.findAll filtered by that info hash
 */
function getContents(torrent) {
  const { infoHash } = torrent;
  const query = { where: { infoHash } };
  return Content.findAll(query);
}
function getSkipTorrent(torrent) {
@@ -219,7 +264,10 @@ module.exports = {
getFiles,
getFilesBasedOnTitle,
deleteFile,
createSubtitle,
createSubtitles,
getSubtitles,
createContents,
getContents,
getSkipTorrent,
createSkipTorrent,
getTorrentsWithoutSize

View File

@@ -4,7 +4,7 @@ const parseTorrent = require('parse-torrent');
const BTClient = require('bittorrent-tracker')
const async = require('async');
const decode = require('magnet-uri');
const isVideo = require('./video');
const { isVideo, isSubtitle } = require('./extension');
const { cacheTrackers } = require('./cache');
const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_best.txt';
@@ -49,13 +49,15 @@ module.exports.updateTorrentSize = function (torrent) {
// Exported shortcut: delegates to filesAndSizeFromTorrentStream with a fixed timeout of 30000
// (presumably milliseconds — confirm in filesAndSizeFromTorrentStream).
module.exports.sizeAndFiles = torrent => filesAndSizeFromTorrentStream(torrent, 30000);
module.exports.torrentFiles = function (torrent) {
module.exports.torrentFiles = function (torrent, timeout) {
return getFilesFromObject(torrent)
.catch(() => filesFromTorrentFile(torrent))
.catch(() => filesFromTorrentStream(torrent))
.then((files) => filterVideos(files))
.then((files) => filterSamples(files))
.then((files) => filterExtras(files));
.catch(() => filesFromTorrentStream(torrent, timeout))
.then(files => ({
contents: files,
videos: filterVideos(files),
subtitles: filterSubtitles(files)
}));
};
function getFilesFromObject(torrent) {
@@ -86,11 +88,11 @@ async function filesFromTorrentFile(torrent) {
})));
}
async function filesFromTorrentStream(torrent) {
return filesAndSizeFromTorrentStream(torrent, 60000).then(result => result.files);
async function filesFromTorrentStream(torrent, timeout) {
return filesAndSizeFromTorrentStream(torrent, timeout).then(result => result.files);
}
function filesAndSizeFromTorrentStream(torrent, timeout = 60000) {
function filesAndSizeFromTorrentStream(torrent, timeout = 30000) {
if (!torrent.infoHash && !torrent.magnetLink) {
return Promise.reject(new Error("no infoHash or magnetLink"));
}
@@ -119,18 +121,17 @@ function filesAndSizeFromTorrentStream(torrent, timeout = 60000) {
}
function filterVideos(files) {
return files.filter((file) => isVideo(file.path));
}
function filterSamples(files) {
const maxSize = Math.max(...files.map(file => file.size));
const isSample = file => file.name.match(/sample/i) && maxSize / parseInt(file.size) > 10;
return files.filter(file => !isSample(file));
const isExtra = file => file.path.match(/extras?\//i);
return files
.filter(file => isVideo(file.path))
.filter(file => !isSample(file))
.filter(file => !isExtra(file));
}
function filterExtras(files) {
const isExtra = file => file.path.match(/extras?\//i);
return files.filter(file => !isExtra(file));
/**
 * Keeps only the files whose path is recognised as a subtitle by isSubtitle.
 *
 * @param {Array<{path: string}>} files - file entries of a torrent
 * @returns {Array<{path: string}>} new array with the subtitle entries, in the original order
 */
function filterSubtitles(files) {
  const hasSubtitlePath = ({ path }) => isSubtitle(path);
  return files.filter(hasSubtitlePath);
}
async function getDefaultTrackers() {

View File

@@ -33,21 +33,26 @@ async function createTorrentEntry(torrent, overwrite = false) {
return;
}
const files = await parseTorrentFiles(torrent)
.then(files => overwrite ? overwriteExistingFiles(torrent, files) : files);
if (!files || !files.length) {
const { contents, videos, subtitles } = await parseTorrentFiles(torrent)
.then(torrentContents => overwrite ? overwriteExistingFiles(torrent, torrentContents) : torrentContents)
.catch(error => {
console.log(`Failed getting files for ${torrent.title}`, error.message);
return {};
});
if (!videos || !videos.length) {
console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
return;
}
return repository.createTorrent(torrent)
.then(() => Promise.all(files.map(file => repository.createFile(file))))
return repository.createTorrent({ ...torrent, contents, subtitles })
.then(() => Promise.all(videos.map(video => repository.createFile(video))))
.then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}
async function overwriteExistingFiles(torrent, files) {
if (files && files.length) {
const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash })
async function overwriteExistingFiles(torrent, torrentContents) {
const videos = torrentContents && torrentContents.videos;
if (videos && videos.length) {
const existingFiles = await repository.getFiles({ infoHash: videos[0].infoHash })
.then((existing) => existing
.reduce((map, next) => {
const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
@@ -56,9 +61,9 @@ async function overwriteExistingFiles(torrent, files) {
}, {}))
.catch(() => undefined);
if (existingFiles && Object.keys(existingFiles).length) {
return files
const overwrittenVideos = videos
.map(file => {
const mapping = files.length === 1 && Object.keys(existingFiles).length === 1
const mapping = videos.length === 1 && Object.keys(existingFiles).length === 1
? Object.values(existingFiles)[0]
: existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
if (mapping) {
@@ -66,9 +71,10 @@ async function overwriteExistingFiles(torrent, files) {
return { ...file, id: originalFile.id, size: originalFile.size || file.size };
}
return file;
})
});
return { ...torrentContents, videos: overwrittenVideos };
}
return files;
return torrentContents;
}
return Promise.reject(`No video files found for: ${torrent.title}`);
}

View File

@@ -3,7 +3,7 @@ const distance = require('jaro-winkler');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
const { torrentFiles } = require('../lib/torrent');
const { getMetadata, getImdbId } = require('../lib/metadata');
const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata');
const { Type } = require('./types');
const MIN_SIZE = 10 * 1024 * 1024; // 10 MB
@@ -23,62 +23,78 @@ async function parseTorrentFiles(torrent) {
torrent.type = Type.MOVIE;
}
if (torrent.type === Type.MOVIE && !parsedTorrentName.seasons || metadata && metadata.type === Type.MOVIE) {
if (parsedTorrentName.complete || typeof parsedTorrentName.year === 'string') {
return torrentFiles(torrent)
.then(files => files.filter(file => file.size > MIN_SIZE))
.then(files => Promises.sequence(files
.map((file) => () => findMovieImdbId(file.name)
.then((newImdbId) => ({
infoHash: torrent.infoHash,
fileIndex: file.fileIndex,
title: file.path || file.name,
size: file.size,
imdbId: newImdbId,
})))))
.catch(error => {
console.log(`Failed getting files for ${torrent.title}`, error.message);
return [];
});
}
return [{
infoHash: torrent.infoHash,
title: torrent.title,
size: torrent.size,
imdbId: torrent.imdbId || metadata && metadata.imdbId,
kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
}];
if (torrent.type === Type.MOVIE && (!parsedTorrentName.seasons ||
parsedTorrentName.season === 5 && parsedTorrentName.episode === 1)) {
return parseMovieFiles(torrent, parsedTorrentName, metadata);
}
return getSeriesFiles(torrent, parsedTorrentName)
return parseSeriesFiles(torrent, parsedTorrentName, metadata)
}
/**
 * Builds the video entries for a torrent that is treated as a movie.
 *
 * Videos not larger than MIN_SIZE are dropped. When exactly one video remains it
 * inherits the torrent's own ids (falling back to `metadata`); otherwise each
 * remaining video gets its own IMDb lookup based on its file name, one after another.
 *
 * @param {object} torrent - torrent being processed (infoHash, title, size, imdbId, kitsuId are read)
 * @param {object} parsedName - parsed torrent title, forwarded to getMoviesTorrentContent
 * @param {object} [metadata] - optional metadata providing fallback imdbId / kitsuId
 * @returns {Promise<{contents: *, videos: Array<object>, subtitles: *}>}
 */
async function parseMovieFiles(torrent, parsedName, metadata) {
  const torrentContent = await getMoviesTorrentContent(torrent, parsedName);
  const { contents, videos, subtitles } = torrentContent;
  const largeVideos = videos.filter(video => video.size > MIN_SIZE);

  if (largeVideos.length !== 1) {
    // Several (or zero) candidates: resolve an IMDb id per file, sequentially.
    const lookups = largeVideos.map(video => () => findMovieImdbId(video.name)
        .then(foundImdbId => ({
          infoHash: torrent.infoHash,
          fileIndex: video.fileIndex,
          title: video.path || video.name,
          size: video.size,
          imdbId: foundImdbId,
        })));
    const parsedVideos = await Promises.sequence(lookups);
    return { contents, videos: parsedVideos, subtitles };
  }

  // Exactly one candidate: it represents the torrent itself.
  const [onlyVideo] = largeVideos;
  const parsedVideo = {
    infoHash: torrent.infoHash,
    fileIndex: onlyVideo.fileIndex,
    title: onlyVideo.path || torrent.title,
    size: onlyVideo.size || torrent.size,
    imdbId: torrent.imdbId || metadata && metadata.imdbId,
    kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
  };
  return { contents, videos: [parsedVideo], subtitles };
}
async function parseSeriesFiles(torrent, parsedName, metadata) {
const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent, parsedName);
const parsedVideos = await Promise.resolve(videos)
.then((files) => files
.filter((file) => file.size > MIN_SIZE)
.map((file) => parseSeriesFile(file, parsedTorrentName)))
.map((file) => parseSeriesFile(file, parsedName, torrent.type)))
.then((files) => decomposeEpisodes(torrent, files, metadata))
.then((files) => assignKitsuOrImdbEpisodes(torrent, files, metadata))
.then((files) => Promise.all(files.map(file => file.isMovie
? mapSeriesMovie(file, torrent)
: mapSeriesEpisode(file, torrent, files))))
.then((files) => files.reduce((a, b) => a.concat(b), []))
.catch((error) => {
console.log(`Failed getting files for ${torrent.title}`, error.message);
return [];
return { contents, videos: parsedVideos, subtitles };
}
/**
 * Fetches the file listing of a movie torrent via torrentFiles.
 *
 * If the listing cannot be obtained and the parsed title does not hint at a
 * multi-movie pack (`complete` flag or a string-typed `year`), a single
 * placeholder video built from the torrent's own title and size is returned.
 * Otherwise the original error is propagated.
 *
 * @param {object} torrent - torrent being processed (title and size are used for the placeholder)
 * @param {object} parsedName - parsed torrent title (`complete` and `year` are read)
 * @returns {Promise<object>} the torrentFiles result, or `{ videos: [placeholder] }`
 */
async function getMoviesTorrentContent(torrent, parsedName) {
  const mayHoldSeveralMovies = parsedName.complete || typeof parsedName.year === 'string';
  const recover = error => {
    if (mayHoldSeveralMovies) {
      return Promise.reject(error);
    }
    const placeholder = { name: torrent.title, path: torrent.title, size: torrent.size };
    return { videos: [placeholder] };
  };
  return torrentFiles(torrent).catch(recover);
}
async function getSeriesFiles(torrent, parsedName) {
async function getSeriesTorrentContent(torrent, parsedName) {
const hasMultipleEpisodes = parsedName.complete || parsedName.hasMovies || torrent.size > MULTIPLE_FILES_SIZE ||
(parsedName.seasons && parsedName.seasons.length > 1);
if (!hasMultipleEpisodes && (Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date))) {
return [{
name: torrent.title,
path: torrent.title,
size: torrent.size
}];
}
return torrentFiles(torrent);
const hasSingleEpisode = Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date);
return torrentFiles(torrent)
.catch(error => {
if (!hasMultipleEpisodes && hasSingleEpisode) {
return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] }
}
return Promise.reject(error);
});
}
async function mapSeriesEpisode(file, torrent, files) {
@@ -109,19 +125,20 @@ async function mapSeriesEpisode(file, torrent, files) {
}
/**
 * Maps a file that was detected as a movie inside a series/anime torrent to a
 * single-element list of file entries.
 *
 * For anime torrents a Kitsu id is looked up first; an IMDb id is only searched
 * when no Kitsu id was found. When an IMDb id is available, movie metadata is
 * fetched and its ids take precedence over the looked-up ones.
 *
 * @param {object} file - parsed file info (fileIndex, path, name, size are read)
 * @param {object} torrent - owning torrent (infoHash and type are read)
 * @returns {Promise<Array<object>>} one entry carrying infoHash, fileIndex, title, size, imdbId, kitsuId
 */
async function mapSeriesMovie(file, torrent) {
  const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined;
  const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined;
  // The lookup has to be awaited: without it `metadata` is a pending Promise and
  // `metadata.imdbId` / `metadata.kitsuId` are always undefined.
  // It is skipped altogether when there is no IMDb id to look up.
  const metadata = imdbId
      ? await getMetadata(imdbId, Type.MOVIE).catch(() => undefined)
      : undefined;
  return [{
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: metadata && metadata.imdbId || imdbId,
    kitsuId: metadata && metadata.kitsuId || kitsuId
  }];
}
function parseSeriesFile(file, parsedTorrentName) {
function parseSeriesFile(file, parsedTorrentName, type) {
const fileInfo = parse(file.name);
// the episode may be in a folder containing season number
if (!fileInfo.season && file.path.includes('/')) {
@@ -143,7 +160,8 @@ function parseSeriesFile(file, parsedTorrentName) {
fileInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
fileInfo.episode = fileInfo.episodes && fileInfo.episodes[0];
}
fileInfo.isMovie = (parsedTorrentName.hasMovies && !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year))
fileInfo.isMovie = ((parsedTorrentName.hasMovies || type === Type.ANIME)
&& !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year))
|| (!fileInfo.season && !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i));
return { ...file, ...fileInfo };
@@ -351,11 +369,16 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
return files;
}
function findMovieImdbId(title) {
/**
 * Resolves the IMDb id of a movie from its title.
 *
 * @param {string|object} title - raw title string (parsed with `parse`) or an already parsed title object
 * @param {*} type - accepted but not used by this function
 * @returns the getImdbId lookup for Type.MOVIE, with failures mapped to undefined
 */
function findMovieImdbId(title, type) {
  let parsedTitle = title;
  if (typeof title === 'string') {
    parsedTitle = parse(title);
  }
  const lookup = getImdbId(parsedTitle, Type.MOVIE);
  return lookup.catch(() => undefined);
}
/**
 * Resolves the Kitsu id of a movie from its title.
 *
 * @param {string|object} title - raw title string (parsed with `parse`) or an already parsed title object
 * @returns the getKitsuId lookup for Type.MOVIE, with failures mapped to undefined
 */
function findMovieKitsuId(title) {
  let parsedTitle = title;
  if (typeof title === 'string') {
    parsedTitle = parse(title);
  }
  const lookup = getKitsuId(parsedTitle, Type.MOVIE);
  return lookup.catch(() => undefined);
}
/**
 * Integer part of `episode / 100`.
 *
 * The quotient is converted to a 32-bit integer, which truncates toward zero
 * and turns NaN into 0.
 *
 * @param {number} episode - episode number
 * @returns {number} the truncated quotient
 */
function div100(episode) {
  const hundreds = episode / 100;
  return hundreds | 0; // 32-bit integer conversion drops the fractional part
}