diff --git a/scraper/lib/repository.js b/scraper/lib/repository.js index e498f63..3fe73d3 100644 --- a/scraper/lib/repository.js +++ b/scraper/lib/repository.js @@ -81,11 +81,14 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', { }); function connect() { - return database.sync({ alter: true }) - .catch(error => { - console.error('Failed syncing database: ', error); - throw error; - }); + if (process.env.ENABLE_SYNC) { + return database.sync({ alter: true }) + .catch(error => { + console.error('Failed syncing database: ', error); + throw error; + }); + } + return Promise.resolve(); } function getProvider(provider) { diff --git a/scraper/lib/torrent.js b/scraper/lib/torrent.js index e0aaaf8..ab8353b 100644 --- a/scraper/lib/torrent.js +++ b/scraper/lib/torrent.js @@ -7,7 +7,8 @@ const { retrieveTorrentFiles } = require('./cache'); const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20; const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs -const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"]; +const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv", + "ogm"]; module.exports.updateCurrentSeeders = function (torrent) { return new Promise((resolve) => { diff --git a/scraper/scrapers/thepiratebay/thepiratebay_api.js b/scraper/scrapers/thepiratebay/thepiratebay_api.js index 6e76243..0e45289 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_api.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_api.js @@ -8,7 +8,8 @@ const defaultProxies = [ 'https://thepiratebay.org', 'https://proxybay.pro', 'https://ukpiratebayproxy.com', - 'https://thepiratebayproxy.info']; + 'https://thepiratebayproxy.info' +]; const dumpUrl = '/static/dump/csv/'; const defaultTimeout = 10000; @@ -84,8 +85,8 @@ function torrent(torrentId, config = {}, retries = 2) { const proxyList = config.proxyList || defaultProxies; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config))) - .then((body) => parseTorrentPage(body)) + .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}/`, config) + .then((body) => parseTorrentPage(body)))) .then((torrent) => ({ torrentId, ...torrent })) .catch((err) => torrent(torrentId, config, retries - 1)); } @@ -99,8 +100,8 @@ function search(keyword, config = {}, retries = 2) { const category = config.category || 0; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config))) - .then((body) => parseBody(body)) + .map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config) + .then((body) => parseBody(body)))) .catch((err) => search(keyword, config, retries - 1)); } @@ -113,8 +114,8 @@ function browse(config = {}, retries = 2) { const category = config.category || 0; return Promises.first(proxyList - .map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config))) - .then((body) => parseBody(body)) + .map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config) + .then((body) => parseBody(body)))) .catch((err) => browse(config, retries - 1)); } diff --git a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js index e72645d..86ce85f 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js @@ -17,12 +17,11 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv'; const limiter = new Bottleneck({ maxConcurrent: 40 }); async function scrape() { - const lastScraped = await repository.getProvider({ name: NAME }); const lastDump = { updatedAt: 2147000000 }; //const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); - if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) { + if (lastDump) { console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); await downloadDump(lastDump); @@ -56,11 +55,6 @@ async function scrape() { // return; // } - if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) { - // torrent was already scraped previously, skipping - return; - } - if (!limiter.empty()) { lr.pause() } @@ -138,7 +132,8 @@ async function findTorrentInSource(record) { return Promise.reject(new Error(`Failed to find torrent ${record.title}`)); } return Promise.resolve(torrentFound) - .then((torrent) => thepiratebay.torrent(torrent.torrentId)); + .then((torrent) => thepiratebay.torrent(torrent.torrentId) + .catch(() => thepiratebay.torrent(torrent.torrentId))); } async function findTorrentViaBing(record) { @@ -148,11 +143,12 @@ async function findTorrentViaBing(record) { result.description.includes('Get this torrent'))) .then((result) => { if (!result) { - throw new Error(`Failed to find torrent ${record.title}`); + console.warn(`Failed to find torrent ${record.title}`); + return Promise.resolve(undefined); } return result.link.match(/torrent\/(\w+)\//)[1]; }) - .then((torrentId) => thepiratebay.torrent(torrentId)) + .then((torrentId) => torrentId && thepiratebay.torrent(torrentId)) } function downloadDump(dump) { diff --git a/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js index 6cda1f8..43be053 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js @@ -1,7 +1,6 @@ const moment = require('moment'); const Bottleneck = require('bottleneck'); const LineByLineReader = require('line-by-line'); -const fs = require('fs'); const decode = require('magnet-uri'); const thepiratebay = require('./thepiratebay_api.js'); const { Type } = require('../../lib/types'); @@ -13,6 +12,7 @@ const CSV_FILE_PATH = '/tmp/tpb.csv'; const limiter = new Bottleneck({ maxConcurrent: 40 }); async function scrape() { + // await processTorrentRecord({ torrentId: 35313644, category: 'Video' }); console.log(`starting to scrape tpb dump...`); //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); const checkPoint = 4115000; @@ -88,16 +88,16 @@ async function processTorrentRecord(record) { return; } - const torrentFound = await thepiratebay.torrent(record.torrentId).catch(() => undefined); + const torrentFound = await thepiratebay.torrent(record.torrentId); if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) { return createSkipTorrentEntry(record); } const torrent = { - infoHash: record.infoHash, + infoHash: torrentFound.infoHash, provider: NAME, - torrentId: record.torrentId, + torrentId: torrentFound.torrentId, title: torrentFound.name, size: torrentFound.size, type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,