From 0f91c98b849bfe4ec45b5274c706dad80983af59 Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Sun, 23 Feb 2020 21:10:35 +0100 Subject: [PATCH] refactors scrapers and add kat and unofficial tpb dump scraper --- README.md | 16 ++- index.js | 7 +- lib/metadata.js | 93 +++++++------- lib/repository.js | 3 +- lib/torrent.js | 3 + lib/torrentEntries.js | 47 ++++++++ lib/torrentFiles.js | 12 +- manual/manual.js | 26 +++- package-lock.json | 2 +- scrapers/horriblesubs/horriblesubs_scraper.js | 2 +- scrapers/kickass/kickass_dump_scraper.js | 98 +++++++++++++++ scrapers/thepiratebay/thepiratebay_api.js | 33 +++-- ...craper.js => thepiratebay_dump_scraper.js} | 62 +++------- .../thepiratebay_unofficial_dump_scraper.js | 113 ++++++++++++++++++ 14 files changed, 403 insertions(+), 114 deletions(-) create mode 100644 lib/torrentEntries.js create mode 100644 scrapers/kickass/kickass_dump_scraper.js rename scrapers/thepiratebay/{thepiratebay_scraper.js => thepiratebay_dump_scraper.js} (74%) create mode 100644 scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js diff --git a/README.md b/README.md index e2cb5bb..5809564 100644 --- a/README.md +++ b/README.md @@ -1 +1,15 @@ -# Torrentio Scraper \ No newline at end of file +# Torrentio Scraper + +## Initial dumps + +### The Pirate Bay + +https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA + +https://thepiratebay.org/static/dump/csv/ + +### Kickass + +https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA + +https://web.archive.org/web/20150416071329/http://kickass.to/api \ No newline at end of file diff --git a/index.js b/index.js index e51b924..24c5acb 100644 --- a/index.js +++ b/index.js @@ -2,16 +2,17 @@ require('dotenv').config(); const express = require("express"); const server = express(); const { connect } = require('./lib/repository'); -const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_scraper'); +const thepiratebayScraper = require('./scrapers/thepiratebay/thepiratebay_dump_scraper'); const horribleSubsScraper = require('./scrapers/horriblesubs/horriblesubs_scraper'); +const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper'); -const providers = [horribleSubsScraper]; +const providers = [thepiratebayDumpScraper]; async function scrape() { providers.forEach((provider) => provider.scrape()); } -server.post('/scrape', function(req, res) { +server.post('/scrape', function (req, res) { scrape(); res.send(200); }); diff --git a/lib/metadata.js b/lib/metadata.js index a911cb5..6f9f57d 100644 --- a/lib/metadata.js +++ b/lib/metadata.js @@ -6,57 +6,63 @@ const { Type } = require('./types'); const CINEMETA_URL = 'https://v3-cinemeta.strem.io'; const KITSU_URL = 'https://anime-kitsu.now.sh'; +const TIMEOUT = 20000; function getMetadata(id, type = Type.SERIES) { const key = id.match(/^\d+$/) ? `kitsu:${id}` : id; const metaType = type === Type.MOVIE ? Type.MOVIE : Type.SERIES; return cacheWrapMetadata(key, - () => needle('get', `${KITSU_URL}/meta/${metaType}/${key}.json`, { open_timeout: 60000 }) - .then((response) => { - const body = response.body; - if (body && body.meta && body.meta.id) { - return { - kitsuId: body.meta.kitsu_id, - imdbId: body.meta.imdb_id, - title: body.meta.name, - year: body.meta.year, - country: body.meta.country, - genres: body.meta.genres, - videos: (body.meta.videos || []) - .map((video) => video.imdbSeason - ? { - season: video.season, - episode: video.episode, - imdbSeason: video.imdbSeason, - imdbEpisode: video.imdbEpisode - } - : { - season: video.season, - episode: video.episode, - kitsuId: video.kitsu_id, - kitsuEpisode: video.kitsuEpisode, - released: video.released - } - ), - episodeCount: Object.values((body.meta.videos || []) - .filter((entry) => entry.season !== 0) - .sort((a, b) => a.season - b.season) - .reduce((map, next) => { - map[next.season] = map[next.season] + 1 || 1; - return map; - }, {})), - totalCount: body.meta.videos && body.meta.videos - .filter((entry) => entry.season !== 0).length - }; - } else { - throw new Error('No search results'); - } - }) + () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`) + .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`)) .catch((error) => { - throw new Error(`failed kitsu query ${kitsuId} due: ${error.message}`); + throw new Error(`failed metadata query ${kitsuId} due: ${error.message}`); })); } +function _requestMetadata(url) { + return needle('get', url, { open_timeout: TIMEOUT }) + .then((response) => { + const body = response.body; + if (body && body.meta && body.meta.id) { + return { + kitsuId: body.meta.kitsu_id, + imdbId: body.meta.imdb_id, + title: body.meta.name, + year: body.meta.year, + country: body.meta.country, + genres: body.meta.genres, + videos: (body.meta.videos || []) + .map((video) => video.imdbSeason + ? { + season: video.season, + episode: video.episode, + imdbSeason: video.imdbSeason, + imdbEpisode: video.imdbEpisode + } + : { + season: video.season, + episode: video.episode, + kitsuId: video.kitsu_id, + kitsuEpisode: video.kitsuEpisode, + released: video.released + } + ), + episodeCount: Object.values((body.meta.videos || []) + .filter((entry) => entry.season !== 0) + .sort((a, b) => a.season - b.season) + .reduce((map, next) => { + map[next.season] = map[next.season] + 1 || 1; + return map; + }, {})), + totalCount: body.meta.videos && body.meta.videos + .filter((entry) => entry.season !== 0).length + }; + } else { + throw new Error('No search results'); + } + }); +} + function escapeTitle(title, hyphenEscape = true) { return title.toLowerCase() .normalize('NFKD') // normalize non-ASCII characters @@ -86,7 +92,8 @@ async function getImdbId(info) { .match(/imdb\.com\/title\/(tt\d+)/)[1]))); } -async function getKitsuId(title) { +async function getKitsuId(info) { + const title = info.season > 1 ? `${info.name} S${info.season}` : info.name; const query = title.replace(/[;]+/g, ' ').replace(/[,%']+/g, ''); return cacheWrapImdbId(query, () => needle('get', `${KITSU_URL}/catalog/series/kitsu-anime-list/search=${query}.json`, { open_timeout: 60000 }) diff --git a/lib/repository.js b/lib/repository.js index 53ace44..71e0ddc 100644 --- a/lib/repository.js +++ b/lib/repository.js @@ -13,6 +13,7 @@ const Provider = database.define('provider', { const Torrent = database.define('torrent', { infoHash: { type: Sequelize.STRING(64), primaryKey: true }, provider: { type: Sequelize.STRING(32), allowNull: false }, + torrentId: { type: Sequelize.STRING(128) }, title: { type: Sequelize.STRING(256), allowNull: false }, size: { type: Sequelize.BIGINT }, type: { type: Sequelize.STRING(16), allowNull: false }, @@ -42,7 +43,7 @@ const File = database.define('file', { indexes: [ { unique: true, fields: ['infoHash'], where: { fileIndex: { [Op.eq]: null } } }, - { unique: true, fields: ['infoHash', 'fileIndex', 'imdbEpisode'] }, + { unique: true, fields: ['infoHash', 'fileIndex', 'imdbSeason', 'imdbEpisode'] }, { unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] }, { unique: false, fields: ['kitsuId', 'kitsuEpisode'] } ] diff --git a/lib/torrent.js b/lib/torrent.js index 62d42d1..bf88460 100644 --- a/lib/torrent.js +++ b/lib/torrent.js @@ -76,6 +76,9 @@ async function filesFromTorrentStream(torrent) { if (!torrent.infoHash && !torrent.magnetLink) { return Promise.reject(new Error("no infoHash or magnetLink")); } + if (torrent.seeders === 0) { + return Promise.reject(new Error("no seeders for the torrent")); + } return new Promise((resolve, rejected) => { const engine = new torrentStream(torrent.magnetLink || torrent.infoHash, { connections: MAX_PEER_CONNECTIONS }); diff --git a/lib/torrentEntries.js b/lib/torrentEntries.js new file mode 100644 index 0000000..bb9b9fa --- /dev/null +++ b/lib/torrentEntries.js @@ -0,0 +1,47 @@ +const { parse } = require('parse-torrent-title'); +const { Type } = require('./types'); +const repository = require('./repository'); +const { getImdbId, getKitsuId, escapeTitle } = require('./metadata'); +const { parseTorrentFiles } = require('./torrentFiles'); + +async function createTorrentEntry(torrent) { + const titleInfo = parse(torrent.title); + const searchTitle = escapeTitle(titleInfo.title).toLowerCase(); + + if (!torrent.imdbId && torrent.type !== Type.ANIME) { + torrent.imdbId = await getImdbId({ name: searchTitle, year: titleInfo.year, type: torrent.type }) + .catch(() => undefined); + } + if (!torrent.kitsuId && torrent.type === Type.ANIME) { + torrent.kitsuId = await getKitsuId({ name: searchTitle, season: titleInfo.season }) + .catch(() => undefined); + } + + if (!torrent.imdbId && !torrent.kitsuId && !titleInfo.complete) { + console.log(`imdbId or kitsuId not found: ${torrent.title}`); + repository.createFailedImdbTorrent(torrent); + return; + } + + const files = await parseTorrentFiles(torrent); + if (!files || !files.length) { + console.log(`no video files found: ${torrent.title}`); + return; + } + + repository.createTorrent(torrent) + .then(() => files.forEach(file => repository.createFile(file))) + .then(() => console.log(`Created entry for ${torrent.title}`)); +} + +async function createSkipTorrentEntry(torrent) { + return repository.createSkipTorrent(torrent); +} + +async function getStoredTorrentEntry(torrent) { + return repository.getSkipTorrent(torrent) + .catch(() => repository.getTorrent(torrent)) + .catch(() => undefined); +} + +module.exports = { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry }; diff --git a/lib/torrentFiles.js b/lib/torrentFiles.js index 5bf6ccc..deb3eb0 100644 --- a/lib/torrentFiles.js +++ b/lib/torrentFiles.js @@ -116,7 +116,7 @@ function parseSeriesFile(file, parsedTorrentName) { return { ...file, ...fileInfo }; } -async function decomposeEpisodes(torrent, files, metadata = { episodeCount: {} }) { +async function decomposeEpisodes(torrent, files, metadata = { episodeCount: [] }) { if (files.every(file => !file.episodes && !file.date)) { return files; } @@ -142,7 +142,7 @@ async function decomposeEpisodes(torrent, files, metadata = { episodeCount: {} } && file.episodes.every(ep => metadata.episodeCount[file.season - 1] < ep))) && (sortedEpisodes.length <= 1 || sortedEpisodes.slice(1).every((ep, i) => ep - sortedEpisodes[i] <= 2))) { decomposeAbsoluteEpisodeFiles(torrent, files, metadata); - } else if (files.every(file => !file.season && file.date)) { + } else if (files.every(file => (!file.season || !metadata.episodeCount[file.season - 1]) && file.date)) { decomposeDateEpisodeFiles(torrent, files, metadata); } @@ -167,6 +167,14 @@ function decomposeConcatSeasonAndEpisodeFiles(torrent, files, metadata) { } function decomposeAbsoluteEpisodeFiles(torrent, files, metadata) { + if (metadata.episodeCount.length === 0) { + files + .filter(file => !file.season && file.episodes && !file.isMovie) + .forEach(file => { + file.season = 1; + }); + return; + } files .filter(file => file.episodes && !file.isMovie) .forEach(file => { diff --git a/manual/manual.js b/manual/manual.js index cdca616..15afb50 100644 --- a/manual/manual.js +++ b/manual/manual.js @@ -1,4 +1,5 @@ require('dotenv').config(); +const { parse } = require('parse-torrent-title'); const repository = require('../lib/repository'); const { parseTorrentFiles } = require('../lib/torrentFiles'); const { Type } = require('../lib/types'); @@ -18,8 +19,10 @@ async function addMissingEpisodes() { title: file.name, size: file.size, imdbId: imdbId, - imdbSeason: parseInt(file.name.match(/(\d+)[ .]?-[ .]?\d+/)[1], 10), - imdbEpisode: parseInt(file.name.match(/\d+[ .]?-[ .]?(\d+)/)[1], 10), + imdbSeason: parse(file.name).season, + imdbEpisode: parse(file.name).episode, + // imdbSeason: parseInt(file.name.match(/(\d+)[ .]?-[ .]?\d+/)[1], 10), + // imdbEpisode: parseInt(file.name.match(/\d+[ .]?-[ .]?(\d+)/)[1], 10), })) .forEach((file) => repository.createFile(file)); } @@ -70,10 +73,25 @@ async function findAllFiles() { // type: Type.SERIES, // imdbId: 'tt3444938' // }; + /* Not all seasons available so Date based episode */ + // const torrent = { + // infoHash: 'DCD5ACF85F4203FE14428A890528B2EDBD07B092', + // title: 'The Young And The Restless - S43 E10986 - 2016-08-12', + // size: 989777743, + // type: Type.SERIES, + // imdbId: 'tt0069658' + // }; + // const torrent = { + // infoHash: 'C75FBDCD62EB882746A0E58B19BADD60DE14526B', + // title: 'Jimmy.Kimmel.2016.08.03.Hugh.Grant.480p.x264-mSD', + // size: 618637331, + // type: Type.SERIES, + // imdbId: 'tt0320037' + // }; return parseTorrentFiles(torrent) .then((files) => console.log(files)); } -//addMissingEpisodes().then(() => console.log('Finished')); -findAllFiles().then(() => console.log('Finished')); \ No newline at end of file +addMissingEpisodes().then(() => console.log('Finished')); +//findAllFiles().then(() => console.log('Finished')); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index fe1b312..82c3672 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1714,7 +1714,7 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#bfa710a62818723049b39869756bb198056ddde3", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#0da8a2caeeadb7e20317cd33ad2fc647e4e53d70", "from": "git://github.com/TheBeastLT/parse-torrent-title.git#master" }, "parseurl": { diff --git a/scrapers/horriblesubs/horriblesubs_scraper.js b/scrapers/horriblesubs/horriblesubs_scraper.js index b2566ea..fd40dcb 100644 --- a/scrapers/horriblesubs/horriblesubs_scraper.js +++ b/scrapers/horriblesubs/horriblesubs_scraper.js @@ -55,7 +55,7 @@ async function enrichShow(show) { console.log(`${NAME}: getting show info for ${show.title}...`); const showId = await horriblesubs._getShowId(show.url) .catch((error) => show.title); - const metadata = await getKitsuId(show.title) + const metadata = await getKitsuId({ name: show.title }) .then((kitsuId) => getMetadata(kitsuId)) .catch((error) => { console.log(`Failed getting kitsu meta: ${error.message}`); diff --git a/scrapers/kickass/kickass_dump_scraper.js b/scrapers/kickass/kickass_dump_scraper.js new file mode 100644 index 0000000..af947a1 --- /dev/null +++ b/scrapers/kickass/kickass_dump_scraper.js @@ -0,0 +1,98 @@ +const moment = require('moment'); +const Bottleneck = require('bottleneck'); +const LineByLineReader = require('line-by-line'); +const fs = require('fs'); +const { Type } = require('../../lib/types'); +const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries'); + +const NAME = 'KickassTorrents'; +const CSV_FILE_PATH = '/tmp/kickass.csv'; + +const limiter = new Bottleneck({ maxConcurrent: 40 }); + +async function scrape() { + console.log(`starting to scrape KAT dump: ${JSON.stringify(lastDump)}`); + + let entriesProcessed = 0; + const lr = new LineByLineReader(CSV_FILE_PATH); + lr.on('line', (line) => { + if (entriesProcessed % 1000 === 0) { + console.log(`Processed ${entriesProcessed} entries`); + } + const row = line.match(/(?<=^|\|)(".*"|[^|]+)(?=\||$)/g); + if (row.length !== 11) { + console.log(`Invalid row: ${line}`); + return; + } + const torrent = { + infoHash: row[0].toLowerCase(), + title: row[1] + .replace(/^"|"$/g, '') + .replace(/&/g, '&') + .replace(/&\w{2,6};/g, ' ') + .replace(/\s+/g, ' ') + .trim(), + category: row[2], + size: parseInt(row[5], 10), + seeders: parseInt(row[8], 10), + uploadDate: moment.unix(parseInt(row[10], 10)).toDate(), + }; + + if (!limiter.empty()) { + lr.pause() + } + + limiter.schedule(() => processTorrentRecord(torrent) + .catch((error) => console.log(`failed ${torrent.title} due: ${error}`))) + .then(() => limiter.empty()) + .then((empty) => empty && lr.resume()) + .then(() => entriesProcessed++); + }); + lr.on('error', (err) => { + console.log(err); + }); + lr.on('end', () => { + fs.unlink(CSV_FILE_PATH); + console.log(`finished to scrape KAT dump: ${JSON.stringify(lastDump)}!`); + }); +} + +const categoryMapping = { + "Movies": Type.MOVIE, + "TV": Type.SERIES, + "Anime": Type.ANIME +}; + +async function processTorrentRecord(record) { + if (!categoryMapping[record.category] || record.seeders === 0) { + return createSkipTorrentEntry(record); + } + if (await getStoredTorrentEntry(record)) { + return; + } + + const torrentFound = await findTorrent(record).catch(() => undefined); + + if (!torrentFound) { + return createSkipTorrentEntry(record); + } + + const torrent = { + infoHash: record.infoHash, + provider: NAME, + title: torrentFound.name, + size: record.size, + type: categoryMapping[record.category], + imdbId: torrentFound.imdbId, + uploadDate: record.uploadDate, + seeders: torrentFound.seeders, + }; + + return createTorrentEntry(torrent); +} + +async function findTorrent(record) { + return Promise.reject("not found"); +} + +module.exports = { scrape }; \ No newline at end of file diff --git a/scrapers/thepiratebay/thepiratebay_api.js b/scrapers/thepiratebay/thepiratebay_api.js index 998532d..1c723ad 100644 --- a/scrapers/thepiratebay/thepiratebay_api.js +++ b/scrapers/thepiratebay/thepiratebay_api.js @@ -3,7 +3,7 @@ const needle = require('needle'); const moment = require('moment'); const defaultProxies = [ - 'https://thepiratebay.org', + 'https://thepiratebay.org', 'https://piratebays.icu', 'https://piratebays.cool', 'https://piratebays.life']; @@ -89,6 +89,7 @@ function torrent(torrentId, config = {}, retries = 2) { return raceFirstSuccessful(proxyList .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config))) .then((body) => parseTorrentPage(body)) + .then((torrent) => ({ torrentId, ...torrent })) .catch((err) => torrent(torrentId, config, retries - 1)); } @@ -126,7 +127,7 @@ function singleRequest(requestUrl, config = {}) { const timeout = config.timeout || defaultTimeout; return needle('get', requestUrl, { open_timeout: timeout, follow: 2 }) - .then((response) => { + .then((response) => { const body = response.body; if (!body) { throw new Error(`No body: ${requestUrl}`); @@ -154,19 +155,22 @@ function parseBody(body) { const torrents = []; - $('table[id=\'searchResult\'] tr').each(function() { + $('table[id=\'searchResult\'] tr').each(function () { const name = $(this).find('.detLink').text(); const sizeMatcher = $(this).find('.detDesc').text().match(/(?:,\s?Size\s)(.+),/); if (!name || !sizeMatcher) { return; } torrents.push({ + torrentId: $(this).find('.detLink').attr('href').match(/torrent\/([^/]+)/)[1], name: name, seeders: parseInt($(this).find('td[align=\'right\']').eq(0).text(), 10), leechers: parseInt($(this).find('td[align=\'right\']').eq(1).text(), 10), magnetLink: $(this).find('a[title=\'Download this torrent using magnet\']').attr('href'), - category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10), - subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], 10), + category: parseInt($(this).find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], + 10), + subcategory: parseInt($(this).find('a[title=\'More from this category\']').eq(1).attr('href').match(/\d+$/)[0], + 10), size: parseSize(sizeMatcher[1]) }); }); @@ -181,15 +185,20 @@ function parseTorrentPage(body) { if (!$) { reject(new Error(errors.PARSER_ERROR)); } + const details = $('div[id=\'details\']'); + const col1 = details.find('dl[class=\'col1\']'); + const imdbIdMatch = col1.html().match(/imdb\.com\/title\/(tt\d+)/i); const torrent = { - name: $('div[id=\'title\']').text().trim(), - seeders: parseInt($('dl[class=\'col2\']').find('dd').eq(2).text(), 10), - leechers: parseInt($('dl[class=\'col2\']').find('dd').eq(3).text(), 10), - magnetLink: $('div[id=\'details\']').find('a[title=\'Get this torrent\']').attr('href'), - category: Categories.VIDEO.ALL, - subcategory: parseInt($('dl[class=\'col1\']').find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10), - size: parseSize($('dl[class=\'col1\']').find('dd').eq(2).text().match(/(\d+)(?:.?Bytes)/)[1]) + name: $('div[id=\'title\']').text().trim(), + seeders: parseInt(details.find('dt:contains(\'Seeders:\')').next().text(), 10), + leechers: parseInt(details.find('dt:contains(\'Leechers:\')').next().text(), 10), + magnetLink: details.find('a[title=\'Get this torrent\']').attr('href'), + category: Categories.VIDEO.ALL, + subcategory: parseInt(col1.find('a[title=\'More from this category\']').eq(0).attr('href').match(/\d+$/)[0], 10), + size: parseSize(details.find('dt:contains(\'Size:\')').next().text().match(/(\d+)(?:.?Bytes)/)[1]), + uploadDate: new Date(details.find('dt:contains(\'Uploaded:\')').next().text()), + imdbId: imdbIdMatch && imdbIdMatch[1] }; resolve(torrent); }); diff --git a/scrapers/thepiratebay/thepiratebay_scraper.js b/scrapers/thepiratebay/thepiratebay_dump_scraper.js similarity index 74% rename from scrapers/thepiratebay/thepiratebay_scraper.js rename to scrapers/thepiratebay/thepiratebay_dump_scraper.js index 365514e..cb817d5 100644 --- a/scrapers/thepiratebay/thepiratebay_scraper.js +++ b/scrapers/thepiratebay/thepiratebay_dump_scraper.js @@ -4,13 +4,11 @@ const Bottleneck = require('bottleneck'); const { ungzip } = require('node-gzip'); const LineByLineReader = require('line-by-line'); const fs = require('fs'); -const { parse } = require('parse-torrent-title'); const thepiratebay = require('./thepiratebay_api.js'); const bing = require('nodejs-bing'); const { Type } = require('../../lib/types'); const repository = require('../../lib/repository'); -const { getImdbId, escapeTitle } = require('../../lib/metadata'); -const { parseTorrentFiles } = require('../../lib/torrentFiles'); +const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries'); const NAME = 'ThePirateBay'; const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; @@ -48,7 +46,8 @@ async function scrape() { .replace(/^"|"$/g, '') .replace(/&/g, '&') .replace(/&\w{2,6};/g, ' ') - .replace(/\s+/g, ' '), + .replace(/\s+/g, ' ') + .trim(), size: parseInt(row[3], 10) }; @@ -77,7 +76,7 @@ async function scrape() { }); lr.on('end', () => { fs.unlink(CSV_FILE_PATH); - updateProvider({ name: NAME, lastScraped: lastDump.updatedAt }); + repository.updateProvider({ name: NAME, lastScraped: lastDump.updatedAt }); console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`); }); } @@ -97,64 +96,34 @@ const seriesCategories = [ ]; async function processTorrentRecord(record) { - const alreadyExists = await repository.getSkipTorrent(record) - .catch(() => repository.getTorrent(record)) - .catch(() => undefined); - if (alreadyExists) { + if (await getStoredTorrentEntry(record)) { return; } const torrentFound = await findTorrent(record); - if (!torrentFound) { - //console.log(`not found: ${JSON.stringify(record)}`); - repository.createSkipTorrent(record); - return; - } - if (!allowedCategories.includes(torrentFound.subcategory)) { - //console.log(`wrong category: ${torrentFound.name}`); - repository.createSkipTorrent(record); - return; + if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) { + return createSkipTorrentEntry(record); } - const type = seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE; - const titleInfo = parse(torrentFound.name); - const imdbId = await getImdbId({ - name: escapeTitle(titleInfo.title).toLowerCase(), - year: titleInfo.year, - type: type - }).catch((error) => undefined); const torrent = { infoHash: record.infoHash, provider: NAME, + torrentId: record.torrentId, title: torrentFound.name, - size: record.size, - type: type, - imdbId: imdbId, - uploadDate: record.uploadDate, + size: torrentFound.size, + type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate || record.uploadDate, seeders: torrentFound.seeders, }; - if (!torrent.imdbId && !titleInfo.complete) { - console.log(`imdbId not found: ${torrentFound.name}`); - repository.createFailedImdbTorrent(torrent); - return; - } - - const files = await parseTorrentFiles(torrent); - if (!files || !files.length) { - console.log(`no video files found: ${torrentFound.name}`); - return; - } - - repository.createTorrent(torrent) - .then(() => files.forEach(file => repository.createFile(file))) - .then(() => console.log(`Created entry for ${torrentFound.name}`)); + return createTorrentEntry(torrent); } async function findTorrent(record) { return findTorrentInSource(record) - .catch((error) => findTorrentViaBing(record)); + .catch(() => findTorrentViaBing(record)); } async function findTorrentInSource(record) { @@ -168,7 +137,8 @@ async function findTorrentInSource(record) { if (!torrentFound) { return Promise.reject(new Error(`Failed to find torrent ${record.title}`)); } - return Promise.resolve(torrentFound); + return Promise.resolve(torrentFound) + .then((torrent) => thepiratebay.torrent(torrent.torrentId)); } async function findTorrentViaBing(record) { diff --git a/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js b/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js new file mode 100644 index 0000000..e74d3d8 --- /dev/null +++ b/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js @@ -0,0 +1,113 @@ +const moment = require('moment'); +const Bottleneck = require('bottleneck'); +const LineByLineReader = require('line-by-line'); +const fs = require('fs'); +const decode = require('magnet-uri'); +const thepiratebay = require('./thepiratebay_api.js'); +const { Type } = require('../../lib/types'); +const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries'); + +const NAME = 'ThePirateBay'; +const CSV_FILE_PATH = '/tmp/tpb.csv'; + +const limiter = new Bottleneck({ maxConcurrent: 40 }); + +async function scrape() { + console.log(`starting to scrape tpb dump...`); + //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); + const checkPoint = 951000; + + let entriesProcessed = 0; + const lr = new LineByLineReader(CSV_FILE_PATH); + lr.on('line', (line) => { + if (entriesProcessed % 1000 === 0) { + console.log(`Processed ${entriesProcessed} entries`); + } + if (entriesProcessed <= checkPoint) { + entriesProcessed++; + return; + } + + const row = line.match(/(?<=^|,)(".*"|[^,]*)(?=,|$)/g); + if (row.length !== 10) { + console.log(`Invalid row: ${line}`); + return; + } + const torrent = { + torrentId: row[0], + title: row[1] + .replace(/^"|"$/g, '') + .replace(/&/g, '&') + .replace(/&\w{2,6};/g, ' ') + .replace(/\s+/g, ' ') + .trim(), + size: parseInt(row[2], 10), + category: row[4], + subcategory: row[5], + infoHash: row[7].toLowerCase() || decode(row[9]).infoHash, + magnetLink: row[9], + uploadDate: moment(row[8]).toDate(), + }; + + if (!limiter.empty()) { + lr.pause() + } + + limiter.schedule(() => processTorrentRecord(torrent) + .catch((error) => console.log(`failed ${torrent.title} due: ${error}`))) + .then(() => limiter.empty()) + .then((empty) => empty && lr.resume()) + .then(() => entriesProcessed++); + }); + lr.on('error', (err) => { + console.log(err); + }); + lr.on('end', () => { + fs.unlink(CSV_FILE_PATH); + console.log(`finished to scrape tpb dump!`); + }); +} + +const allowedCategories = [ + thepiratebay.Categories.VIDEO.MOVIES, + thepiratebay.Categories.VIDEO.MOVIES_HD, + thepiratebay.Categories.VIDEO.MOVIES_DVDR, + thepiratebay.Categories.VIDEO.MOVIES_3D, + thepiratebay.Categories.VIDEO.TV_SHOWS, + thepiratebay.Categories.VIDEO.TV_SHOWS_HD +]; +const seriesCategories = [ + thepiratebay.Categories.VIDEO.TV_SHOWS, + thepiratebay.Categories.VIDEO.TV_SHOWS_HD +]; + +async function processTorrentRecord(record) { + if (record.category !== 'Video') { + return createSkipTorrentEntry(record); + } + if (await getStoredTorrentEntry(record)) { + return; + } + + const torrentFound = await thepiratebay.torrent(record.torrentId).catch(() => undefined); + + if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) { + return createSkipTorrentEntry(record); + } + + const torrent = { + infoHash: record.infoHash, + provider: NAME, + torrentId: record.torrentId, + title: torrentFound.name, + size: torrentFound.size, + type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE, + imdbId: torrentFound.imdbId, + uploadDate: torrentFound.uploadDate, + seeders: torrentFound.seeders, + }; + + return createTorrentEntry(torrent); +} + +module.exports = { scrape }; \ No newline at end of file