From 14b2f6b1a2a3d433bdaac477d5d588197b142dba Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Sat, 2 May 2020 10:16:03 +0200 Subject: [PATCH] [scraper] saves opened torrent contents and videos and subs separately --- package-lock.json | 4 +- package.json | 2 +- scraper/lib/extension.js | 52 +++++++++++++ scraper/lib/metadata.js | 2 +- scraper/lib/repository.js | 60 +++++++++++++-- scraper/lib/torrent.js | 35 ++++----- scraper/lib/torrentEntries.js | 30 +++++--- scraper/lib/torrentFiles.js | 133 ++++++++++++++++++++-------------- 8 files changed, 224 insertions(+), 94 deletions(-) create mode 100644 scraper/lib/extension.js diff --git a/package-lock.json b/package-lock.json index 75f5f97..bf6f8cc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1687,8 +1687,8 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", - "from": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d", + "from": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d", "requires": { "moment": "^2.24.0" } diff --git a/package.json b/package.json index 1122032..933e348 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "node-schedule": "^1.3.2", "nodejs-bing": "^0.1.0", "parse-torrent": "^6.1.2", - "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#a151662b960b6c826b34245663db12ee4ccc75e6", + "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#213d188496d5645bcbfff5c3b5b3839df486260d", "pg": "^7.8.2", "pg-hstore": "^2.3.2", "rarbg-api": "^1.1.3", diff --git a/scraper/lib/extension.js b/scraper/lib/extension.js new file mode 100644 index 0000000..99b7523 --- /dev/null +++ b/scraper/lib/extension.js @@ -0,0 +1,52 @@ +const VIDEO_EXTENSIONS = [ + "3g2", + "3gp", + "avi", + "flv", + "mkv", + "mov", + "mp2", + "mp4", + "m4v", + "mpe", + "mpeg", + "mpg", + "mpv", + "webm", + "wmv", + "ogm" +]; +const SUBTITLE_EXTENSIONS = [ + "aqt", + "gsub", + "jss", + "sub", + "ttxt", + "pjs", + "psb", + "rt", + "smi", + "slt", + "ssf", + "srt", + "ssa", + "ass", + "usf", + "idx", + "vtt" +]; + +function isVideo(filename) { + return isExtension(filename, VIDEO_EXTENSIONS); +} + +function isSubtitle(filename) { + return isExtension(filename, SUBTITLE_EXTENSIONS); +} + +function isExtension(filename, extensions) { + const extensionMatch = filename.match(/\.(\w{2,4})$/); + return extensionMatch && extensions.includes(extensionMatch[1].toLowerCase()); +} + +module.exports = { isVideo, isSubtitle } \ No newline at end of file diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js index 5a7ecae..fbee6f8 100644 --- a/scraper/lib/metadata.js +++ b/scraper/lib/metadata.js @@ -137,4 +137,4 @@ async function getKitsuId(info) { })); } -module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML }; +module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML, escapeTitle }; diff --git a/scraper/lib/repository.js b/scraper/lib/repository.js index 93d95bb..9a64284 100644 --- a/scraper/lib/repository.js +++ b/scraper/lib/repository.js @@ -31,7 +31,7 @@ const Torrent = database.define('torrent', languages: { type: Sequelize.STRING(256) }, resolution: { type: Sequelize.STRING(16) }, reviewed: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false }, - subsChecked: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false } + opened: { type: Sequelize.BOOLEAN, allowNull: false, defaultValue: false } } ); @@ -104,12 +104,36 @@ const Subtitle = database.define('subtitle', } ); +const Content = database.define('content', + { + infoHash: { + type: Sequelize.STRING(64), + primaryKey: true, + allowNull: false, + references: { model: Torrent, key: 'infoHash' }, + onDelete: 'CASCADE' + }, + fileIndex: { + type: Sequelize.INTEGER, + primaryKey: true, + allowNull: false + }, + path: { type: Sequelize.STRING(512), allowNull: false }, + size: { type: Sequelize.BIGINT }, + }, + { + timestamps: false, + } +); + const SkipTorrent = database.define('skip_torrent', { infoHash: { type: Sequelize.STRING(64), primaryKey: true }, }); Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false }); File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false }); +Torrent.hasMany(Content, { foreignKey: 'infoHash', constraints: false }); +Content.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false }); File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false }); Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false }); @@ -163,7 +187,9 @@ function getUpdateSeedersTorrents() { } function createTorrent(torrent) { - return Torrent.upsert(torrent); + return Torrent.upsert(torrent) + .then(() => createContents(torrent.infoHash, torrent.contents)) + .then(() => createSubtitles(torrent.infoHash, torrent.subtitles)); } function setTorrentSeeders(infoHash, seeders) { @@ -174,7 +200,7 @@ function setTorrentSeeders(infoHash, seeders) { } function createFile(file) { - return File.upsert(file, { include: [Subtitle] }); + return File.create(file, { include: [Subtitle] }); } function getFiles(torrent) { @@ -189,8 +215,27 @@ function deleteFile(file) { return File.destroy({ where: { id: file.id } }) } -function createSubtitle(subtitle) { - return Subtitle.upsert(subtitle); +function createSubtitles(infoHash, subtitles) { + if (subtitles && subtitles.length) { + return Subtitle.bulkCreate(subtitles.map(subtitle => ({ infoHash, title: subtitle.path, ...subtitle }))); + } + return Promise.resolve(); +} + +function getSubtitles(torrent) { + return Subtitle.findAll({ where: { infoHash: torrent.infoHash } }); +} + +function createContents(infoHash, contents) { + if (contents && contents.length) { + return Content.bulkCreate(contents.map(content => ({ infoHash, ...content }))) + .then(() => Torrent.update({ opened: true }, { where: { infoHash: infoHash } })); + } + return Promise.resolve(); +} + +function getContents(torrent) { + return Content.findAll({ where: { infoHash: torrent.infoHash } }); } function getSkipTorrent(torrent) { @@ -219,7 +264,10 @@ module.exports = { getFiles, getFilesBasedOnTitle, deleteFile, - createSubtitle, + createSubtitles, + getSubtitles, + createContents, + getContents, getSkipTorrent, createSkipTorrent, getTorrentsWithoutSize diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js index a26b92d..4c9627e 100644 --- a/scraper/lib/torrent.js +++ b/scraper/lib/torrent.js @@ -4,7 +4,7 @@ const parseTorrent = require('parse-torrent'); const BTClient = require('bittorrent-tracker') const async = require('async'); const decode = require('magnet-uri'); -const isVideo = require('./video'); +const { isVideo, isSubtitle } = require('./extension'); const { cacheTrackers } = require('./cache'); const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_best.txt'; @@ -49,13 +49,15 @@ module.exports.updateTorrentSize = function (torrent) { module.exports.sizeAndFiles = torrent => filesAndSizeFromTorrentStream(torrent, 30000); -module.exports.torrentFiles = function (torrent) { +module.exports.torrentFiles = function (torrent, timeout) { return getFilesFromObject(torrent) .catch(() => filesFromTorrentFile(torrent)) - .catch(() => filesFromTorrentStream(torrent)) - .then((files) => filterVideos(files)) - .then((files) => filterSamples(files)) - .then((files) => filterExtras(files)); + .catch(() => filesFromTorrentStream(torrent, timeout)) + .then(files => ({ + contents: files, + videos: filterVideos(files), + subtitles: filterSubtitles(files) + })); }; function getFilesFromObject(torrent) { @@ -86,11 +88,11 @@ async function filesFromTorrentFile(torrent) { }))); } -async function filesFromTorrentStream(torrent) { - return filesAndSizeFromTorrentStream(torrent, 60000).then(result => result.files); +async function filesFromTorrentStream(torrent, timeout) { + return filesAndSizeFromTorrentStream(torrent, timeout).then(result => result.files); } -function filesAndSizeFromTorrentStream(torrent, timeout = 60000) { +function filesAndSizeFromTorrentStream(torrent, timeout = 30000) { if (!torrent.infoHash && !torrent.magnetLink) { return Promise.reject(new Error("no infoHash or magnetLink")); } @@ -119,18 +121,17 @@ function filesAndSizeFromTorrentStream(torrent, timeout = 60000) { } function filterVideos(files) { - return files.filter((file) => isVideo(file.path)); -} - -function filterSamples(files) { const maxSize = Math.max(...files.map(file => file.size)); const isSample = file => file.name.match(/sample/i) && maxSize / parseInt(file.size) > 10; - return files.filter(file => !isSample(file)); + const isExtra = file => file.path.match(/extras?\//i); + return files + .filter(file => isVideo(file.path)) + .filter(file => !isSample(file)) + .filter(file => !isExtra(file)); } -function filterExtras(files) { - const isExtra = file => file.path.match(/extras?\//i); - return files.filter(file => !isExtra(file)); +function filterSubtitles(files) { + return files.filter(file => isSubtitle(file.path)); } async function getDefaultTrackers() { diff --git a/scraper/lib/torrentEntries.js b/scraper/lib/torrentEntries.js index 09e799d..1d88b95 100644 --- a/scraper/lib/torrentEntries.js +++ b/scraper/lib/torrentEntries.js @@ -33,21 +33,26 @@ async function createTorrentEntry(torrent, overwrite = false) { return; } - const files = await parseTorrentFiles(torrent) - .then(files => overwrite ? overwriteExistingFiles(torrent, files) : files); - if (!files || !files.length) { + const { contents, videos, subtitles } = await parseTorrentFiles(torrent) + .then(torrentContents => overwrite ? overwriteExistingFiles(torrent, torrentContents) : torrentContents) + .catch(error => { + console.log(`Failed getting files for ${torrent.title}`, error.message); + return {}; + }); + if (!videos || !videos.length) { console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`); return; } - return repository.createTorrent(torrent) - .then(() => Promise.all(files.map(file => repository.createFile(file)))) + return repository.createTorrent({ ...torrent, contents, subtitles }) + .then(() => Promise.all(videos.map(video => repository.createFile(video)))) .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`)); } -async function overwriteExistingFiles(torrent, files) { - if (files && files.length) { - const existingFiles = await repository.getFiles({ infoHash: files[0].infoHash }) +async function overwriteExistingFiles(torrent, torrentContents) { + const videos = torrentContents && torrentContents.videos; + if (videos && videos.length) { + const existingFiles = await repository.getFiles({ infoHash: videos[0].infoHash }) .then((existing) => existing .reduce((map, next) => { const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null; @@ -56,9 +61,9 @@ async function overwriteExistingFiles(torrent, files) { }, {})) .catch(() => undefined); if (existingFiles && Object.keys(existingFiles).length) { - return files + const overwrittenVideos = videos .map(file => { - const mapping = files.length === 1 && Object.keys(existingFiles).length === 1 + const mapping = videos.length === 1 && Object.keys(existingFiles).length === 1 ? Object.values(existingFiles)[0] : existingFiles[file.fileIndex !== undefined ? file.fileIndex : null]; if (mapping) { @@ -66,9 +71,10 @@ async function overwriteExistingFiles(torrent, files) { return { ...file, id: originalFile.id, size: originalFile.size || file.size }; } return file; - }) + }); + return { ...torrentContents, videos: overwrittenVideos }; } - return files; + return torrentContents; } return Promise.reject(`No video files found for: ${torrent.title}`); } diff --git a/scraper/lib/torrentFiles.js b/scraper/lib/torrentFiles.js index a83aab6..081f44d 100644 --- a/scraper/lib/torrentFiles.js +++ b/scraper/lib/torrentFiles.js @@ -3,7 +3,7 @@ const distance = require('jaro-winkler'); const { parse } = require('parse-torrent-title'); const Promises = require('../lib/promises'); const { torrentFiles } = require('../lib/torrent'); -const { getMetadata, getImdbId } = require('../lib/metadata'); +const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata'); const { Type } = require('./types'); const MIN_SIZE = 10 * 1024 * 1024; // 10 MB @@ -23,62 +23,78 @@ async function parseTorrentFiles(torrent) { torrent.type = Type.MOVIE; } - if (torrent.type === Type.MOVIE && !parsedTorrentName.seasons || metadata && metadata.type === Type.MOVIE) { - if (parsedTorrentName.complete || typeof parsedTorrentName.year === 'string') { - return torrentFiles(torrent) - .then(files => files.filter(file => file.size > MIN_SIZE)) - .then(files => Promises.sequence(files - .map((file) => () => findMovieImdbId(file.name) - .then((newImdbId) => ({ - infoHash: torrent.infoHash, - fileIndex: file.fileIndex, - title: file.path || file.name, - size: file.size, - imdbId: newImdbId, - }))))) - .catch(error => { - console.log(`Failed getting files for ${torrent.title}`, error.message); - return []; - }); - } - - return [{ - infoHash: torrent.infoHash, - title: torrent.title, - size: torrent.size, - imdbId: torrent.imdbId || metadata && metadata.imdbId, - kitsuId: torrent.kitsuId || metadata && metadata.kitsuId - }]; + if (torrent.type === Type.MOVIE && (!parsedTorrentName.seasons || + parsedTorrentName.season === 5 && parsedTorrentName.episode === 1)) { + return parseMovieFiles(torrent, parsedTorrentName, metadata); } - return getSeriesFiles(torrent, parsedTorrentName) + return parseSeriesFiles(torrent, parsedTorrentName, metadata) +} + +async function parseMovieFiles(torrent, parsedName, metadata) { + const { contents, videos, subtitles } = await getMoviesTorrentContent(torrent, parsedName); + const filteredVideos = videos.filter(file => file.size > MIN_SIZE); + if (filteredVideos.length === 1) { + const parsedVideo = { + infoHash: torrent.infoHash, + fileIndex: filteredVideos[0].fileIndex, + title: filteredVideos[0].path || torrent.title, + size: filteredVideos[0].size || torrent.size, + imdbId: torrent.imdbId || metadata && metadata.imdbId, + kitsuId: torrent.kitsuId || metadata && metadata.kitsuId + }; + return { contents, videos: [parsedVideo], subtitles }; + } + + const parsedVideos = await Promises.sequence(filteredVideos + .map(file => () => findMovieImdbId(file.name) + .then(newImdbId => ({ + infoHash: torrent.infoHash, + fileIndex: file.fileIndex, + title: file.path || file.name, + size: file.size, + imdbId: newImdbId, + })))); + return { contents, videos: parsedVideos, subtitles }; +} + +async function parseSeriesFiles(torrent, parsedName, metadata) { + const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent, parsedName); + const parsedVideos = await Promise.resolve(videos) .then((files) => files .filter((file) => file.size > MIN_SIZE) - .map((file) => parseSeriesFile(file, parsedTorrentName))) + .map((file) => parseSeriesFile(file, parsedName, torrent.type))) .then((files) => decomposeEpisodes(torrent, files, metadata)) .then((files) => assignKitsuOrImdbEpisodes(torrent, files, metadata)) .then((files) => Promise.all(files.map(file => file.isMovie ? mapSeriesMovie(file, torrent) : mapSeriesEpisode(file, torrent, files)))) .then((files) => files.reduce((a, b) => a.concat(b), [])) - .catch((error) => { - console.log(`Failed getting files for ${torrent.title}`, error.message); - return []; + return { contents, videos: parsedVideos, subtitles }; +} + +async function getMoviesTorrentContent(torrent, parsedName) { + const hasMultipleMovie = parsedName.complete || typeof parsedName.year === 'string'; + return torrentFiles(torrent) + .catch(error => { + if (!hasMultipleMovie) { + return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] } + } + return Promise.reject(error); }); } -async function getSeriesFiles(torrent, parsedName) { +async function getSeriesTorrentContent(torrent, parsedName) { const hasMultipleEpisodes = parsedName.complete || parsedName.hasMovies || torrent.size > MULTIPLE_FILES_SIZE || (parsedName.seasons && parsedName.seasons.length > 1); - if (!hasMultipleEpisodes && (Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date))) { - return [{ - name: torrent.title, - path: torrent.title, - size: torrent.size - }]; - } - - return torrentFiles(torrent); + const hasSingleEpisode = Number.isInteger(parsedName.episode) || (!parsedName.episodes && parsedName.date); + return torrentFiles(torrent) + .catch(error => { + if (!hasMultipleEpisodes && hasSingleEpisode) { + return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] } + } + return Promise.reject(error); + }); } async function mapSeriesEpisode(file, torrent, files) { @@ -109,19 +125,20 @@ async function mapSeriesEpisode(file, torrent, files) { } async function mapSeriesMovie(file, torrent) { - return findMovieImdbId(file) - .then(imdbId => getMetadata(imdbId, Type.MOVIE).catch(() => ({ imdbId }))) - .then(metadata => [{ - infoHash: torrent.infoHash, - fileIndex: file.fileIndex, - title: file.path || file.name, - size: file.size, - imdbId: metadata.imdbId, - kitsuId: metadata.kitsuId - }]); + const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined; + const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined; + const metadata = getMetadata(imdbId, Type.MOVIE).catch(() => undefined); + return [{ + infoHash: torrent.infoHash, + fileIndex: file.fileIndex, + title: file.path || file.name, + size: file.size, + imdbId: metadata && metadata.imdbId || imdbId, + kitsuId: metadata && metadata.kitsuId || kitsuId + }]; } -function parseSeriesFile(file, parsedTorrentName) { +function parseSeriesFile(file, parsedTorrentName, type) { const fileInfo = parse(file.name); // the episode may be in a folder containing season number if (!fileInfo.season && file.path.includes('/')) { @@ -143,7 +160,8 @@ function parseSeriesFile(file, parsedTorrentName) { fileInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)]; fileInfo.episode = fileInfo.episodes && fileInfo.episodes[0]; } - fileInfo.isMovie = (parsedTorrentName.hasMovies && !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year)) + fileInfo.isMovie = ((parsedTorrentName.hasMovies || type === Type.ANIME) + && !fileInfo.season && (!fileInfo.episodes || !!fileInfo.year)) || (!fileInfo.season && !!file.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i)); return { ...file, ...fileInfo }; @@ -351,11 +369,16 @@ function assignKitsuOrImdbEpisodes(torrent, files, metadata) { return files; } -function findMovieImdbId(title) { +function findMovieImdbId(title, type) { const parsedTitle = typeof title === 'string' ? parse(title) : title; return getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined); } +function findMovieKitsuId(title) { + const parsedTitle = typeof title === 'string' ? parse(title) : title; + return getKitsuId(parsedTitle, Type.MOVIE).catch(() => undefined); +} + function div100(episode) { return (episode / 100 >> 0); // floor to nearest int }