[scraper] updates tpb dump scraper

This commit is contained in:
TheBeastLT
2020-03-15 11:24:18 +01:00
parent 3ca4479d8a
commit 2ea787c0c7
5 changed files with 28 additions and 27 deletions

View File

@@ -81,11 +81,14 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', {
}); });
function connect() { function connect() {
return database.sync({ alter: true }) if (process.env.ENABLE_SYNC) {
.catch(error => { return database.sync({ alter: true })
console.error('Failed syncing database: ', error); .catch(error => {
throw error; console.error('Failed syncing database: ', error);
}); throw error;
});
}
return Promise.resolve();
} }
function getProvider(provider) { function getProvider(provider) {

View File

@@ -7,7 +7,8 @@ const { retrieveTorrentFiles } = require('./cache');
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20; const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"]; const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv",
"ogm"];
module.exports.updateCurrentSeeders = function (torrent) { module.exports.updateCurrentSeeders = function (torrent) {
return new Promise((resolve) => { return new Promise((resolve) => {

View File

@@ -8,7 +8,8 @@ const defaultProxies = [
'https://thepiratebay.org', 'https://thepiratebay.org',
'https://proxybay.pro', 'https://proxybay.pro',
'https://ukpiratebayproxy.com', 'https://ukpiratebayproxy.com',
'https://thepiratebayproxy.info']; 'https://thepiratebayproxy.info'
];
const dumpUrl = '/static/dump/csv/'; const dumpUrl = '/static/dump/csv/';
const defaultTimeout = 10000; const defaultTimeout = 10000;
@@ -84,8 +85,8 @@ function torrent(torrentId, config = {}, retries = 2) {
const proxyList = config.proxyList || defaultProxies; const proxyList = config.proxyList || defaultProxies;
return Promises.first(proxyList return Promises.first(proxyList
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config))) .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}/`, config)
.then((body) => parseTorrentPage(body)) .then((body) => parseTorrentPage(body))))
.then((torrent) => ({ torrentId, ...torrent })) .then((torrent) => ({ torrentId, ...torrent }))
.catch((err) => torrent(torrentId, config, retries - 1)); .catch((err) => torrent(torrentId, config, retries - 1));
} }
@@ -99,8 +100,8 @@ function search(keyword, config = {}, retries = 2) {
const category = config.category || 0; const category = config.category || 0;
return Promises.first(proxyList return Promises.first(proxyList
.map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config))) .map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config)
.then((body) => parseBody(body)) .then((body) => parseBody(body))))
.catch((err) => search(keyword, config, retries - 1)); .catch((err) => search(keyword, config, retries - 1));
} }
@@ -113,8 +114,8 @@ function browse(config = {}, retries = 2) {
const category = config.category || 0; const category = config.category || 0;
return Promises.first(proxyList return Promises.first(proxyList
.map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config))) .map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config)
.then((body) => parseBody(body)) .then((body) => parseBody(body))))
.catch((err) => browse(config, retries - 1)); .catch((err) => browse(config, retries - 1));
} }

View File

@@ -17,12 +17,11 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
const limiter = new Bottleneck({ maxConcurrent: 40 }); const limiter = new Bottleneck({ maxConcurrent: 40 });
async function scrape() { async function scrape() {
const lastScraped = await repository.getProvider({ name: NAME });
const lastDump = { updatedAt: 2147000000 }; const lastDump = { updatedAt: 2147000000 };
//const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); //const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
//const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) { if (lastDump) {
console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
await downloadDump(lastDump); await downloadDump(lastDump);
@@ -56,11 +55,6 @@ async function scrape() {
// return; // return;
// } // }
if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) {
// torrent was already scraped previously, skipping
return;
}
if (!limiter.empty()) { if (!limiter.empty()) {
lr.pause() lr.pause()
} }
@@ -138,7 +132,8 @@ async function findTorrentInSource(record) {
return Promise.reject(new Error(`Failed to find torrent ${record.title}`)); return Promise.reject(new Error(`Failed to find torrent ${record.title}`));
} }
return Promise.resolve(torrentFound) return Promise.resolve(torrentFound)
.then((torrent) => thepiratebay.torrent(torrent.torrentId)); .then((torrent) => thepiratebay.torrent(torrent.torrentId)
.catch(() => thepiratebay.torrent(torrent.torrentId)));
} }
async function findTorrentViaBing(record) { async function findTorrentViaBing(record) {
@@ -148,11 +143,12 @@ async function findTorrentViaBing(record) {
result.description.includes('Get this torrent'))) result.description.includes('Get this torrent')))
.then((result) => { .then((result) => {
if (!result) { if (!result) {
throw new Error(`Failed to find torrent ${record.title}`); console.warn(`Failed to find torrent ${record.title}`);
return Promise.resolve(undefined);
} }
return result.link.match(/torrent\/(\w+)\//)[1]; return result.link.match(/torrent\/(\w+)\//)[1];
}) })
.then((torrentId) => thepiratebay.torrent(torrentId)) .then((torrentId) => torrentId && thepiratebay.torrent(torrentId))
} }
function downloadDump(dump) { function downloadDump(dump) {

View File

@@ -1,7 +1,6 @@
const moment = require('moment'); const moment = require('moment');
const Bottleneck = require('bottleneck'); const Bottleneck = require('bottleneck');
const LineByLineReader = require('line-by-line'); const LineByLineReader = require('line-by-line');
const fs = require('fs');
const decode = require('magnet-uri'); const decode = require('magnet-uri');
const thepiratebay = require('./thepiratebay_api.js'); const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types'); const { Type } = require('../../lib/types');
@@ -13,6 +12,7 @@ const CSV_FILE_PATH = '/tmp/tpb.csv';
const limiter = new Bottleneck({ maxConcurrent: 40 }); const limiter = new Bottleneck({ maxConcurrent: 40 });
async function scrape() { async function scrape() {
// await processTorrentRecord({ torrentId: 35313644, category: 'Video' });
console.log(`starting to scrape tpb dump...`); console.log(`starting to scrape tpb dump...`);
//const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
const checkPoint = 4115000; const checkPoint = 4115000;
@@ -88,16 +88,16 @@ async function processTorrentRecord(record) {
return; return;
} }
const torrentFound = await thepiratebay.torrent(record.torrentId).catch(() => undefined); const torrentFound = await thepiratebay.torrent(record.torrentId);
if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) { if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
return createSkipTorrentEntry(record); return createSkipTorrentEntry(record);
} }
const torrent = { const torrent = {
infoHash: record.infoHash, infoHash: torrentFound.infoHash,
provider: NAME, provider: NAME,
torrentId: record.torrentId, torrentId: torrentFound.torrentId,
title: torrentFound.name, title: torrentFound.name,
size: torrentFound.size, size: torrentFound.size,
type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE, type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,