mirror of
https://github.com/knightcrawler-stremio/knightcrawler.git
synced 2024-12-20 03:29:51 +00:00
[scraper] updates tpb dump scraper
This commit is contained in:
@@ -81,11 +81,14 @@ const FailedImdbTorrent = database.define('failed_imdb_torrent', {
|
|||||||
});
|
});
|
||||||
|
|
||||||
function connect() {
|
function connect() {
|
||||||
return database.sync({ alter: true })
|
if (process.env.ENABLE_SYNC) {
|
||||||
.catch(error => {
|
return database.sync({ alter: true })
|
||||||
console.error('Failed syncing database: ', error);
|
.catch(error => {
|
||||||
throw error;
|
console.error('Failed syncing database: ', error);
|
||||||
});
|
throw error;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return Promise.resolve();
|
||||||
}
|
}
|
||||||
|
|
||||||
function getProvider(provider) {
|
function getProvider(provider) {
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ const { retrieveTorrentFiles } = require('./cache');
|
|||||||
|
|
||||||
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
|
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
|
||||||
const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs
|
const SEEDS_CHECK_TIMEOUT = process.env.SEEDS_CHECK_TIMEOUT || 10 * 1000; // 10 secs
|
||||||
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv"];
|
const EXTENSIONS = ["3g2", "3gp", "avi", "flv", "mkv", "mov", "mp2", "mp4", "mpe", "mpeg", "mpg", "mpv", "webm", "wmv",
|
||||||
|
"ogm"];
|
||||||
|
|
||||||
module.exports.updateCurrentSeeders = function (torrent) {
|
module.exports.updateCurrentSeeders = function (torrent) {
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
|
|||||||
@@ -8,7 +8,8 @@ const defaultProxies = [
|
|||||||
'https://thepiratebay.org',
|
'https://thepiratebay.org',
|
||||||
'https://proxybay.pro',
|
'https://proxybay.pro',
|
||||||
'https://ukpiratebayproxy.com',
|
'https://ukpiratebayproxy.com',
|
||||||
'https://thepiratebayproxy.info'];
|
'https://thepiratebayproxy.info'
|
||||||
|
];
|
||||||
const dumpUrl = '/static/dump/csv/';
|
const dumpUrl = '/static/dump/csv/';
|
||||||
const defaultTimeout = 10000;
|
const defaultTimeout = 10000;
|
||||||
|
|
||||||
@@ -84,8 +85,8 @@ function torrent(torrentId, config = {}, retries = 2) {
|
|||||||
const proxyList = config.proxyList || defaultProxies;
|
const proxyList = config.proxyList || defaultProxies;
|
||||||
|
|
||||||
return Promises.first(proxyList
|
return Promises.first(proxyList
|
||||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config)))
|
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}/`, config)
|
||||||
.then((body) => parseTorrentPage(body))
|
.then((body) => parseTorrentPage(body))))
|
||||||
.then((torrent) => ({ torrentId, ...torrent }))
|
.then((torrent) => ({ torrentId, ...torrent }))
|
||||||
.catch((err) => torrent(torrentId, config, retries - 1));
|
.catch((err) => torrent(torrentId, config, retries - 1));
|
||||||
}
|
}
|
||||||
@@ -99,8 +100,8 @@ function search(keyword, config = {}, retries = 2) {
|
|||||||
const category = config.category || 0;
|
const category = config.category || 0;
|
||||||
|
|
||||||
return Promises.first(proxyList
|
return Promises.first(proxyList
|
||||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config)))
|
.map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config)
|
||||||
.then((body) => parseBody(body))
|
.then((body) => parseBody(body))))
|
||||||
.catch((err) => search(keyword, config, retries - 1));
|
.catch((err) => search(keyword, config, retries - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -113,8 +114,8 @@ function browse(config = {}, retries = 2) {
|
|||||||
const category = config.category || 0;
|
const category = config.category || 0;
|
||||||
|
|
||||||
return Promises.first(proxyList
|
return Promises.first(proxyList
|
||||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config)))
|
.map((proxyUrl) => singleRequest(`${proxyUrl}/browse/${category}/${page}`, config)
|
||||||
.then((body) => parseBody(body))
|
.then((body) => parseBody(body))))
|
||||||
.catch((err) => browse(config, retries - 1));
|
.catch((err) => browse(config, retries - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -17,12 +17,11 @@ const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
|
|||||||
const limiter = new Bottleneck({ maxConcurrent: 40 });
|
const limiter = new Bottleneck({ maxConcurrent: 40 });
|
||||||
|
|
||||||
async function scrape() {
|
async function scrape() {
|
||||||
const lastScraped = await repository.getProvider({ name: NAME });
|
|
||||||
const lastDump = { updatedAt: 2147000000 };
|
const lastDump = { updatedAt: 2147000000 };
|
||||||
//const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
|
//const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
|
||||||
//const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
//const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
|
||||||
|
|
||||||
if (!lastScraped.lastScraped || lastScraped.lastScraped < lastDump.updatedAt) {
|
if (lastDump) {
|
||||||
console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
|
console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
|
||||||
await downloadDump(lastDump);
|
await downloadDump(lastDump);
|
||||||
|
|
||||||
@@ -56,11 +55,6 @@ async function scrape() {
|
|||||||
// return;
|
// return;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
if (lastScraped.lastScraped && lastScraped.lastScraped > torrent.uploadDate) {
|
|
||||||
// torrent was already scraped previously, skipping
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!limiter.empty()) {
|
if (!limiter.empty()) {
|
||||||
lr.pause()
|
lr.pause()
|
||||||
}
|
}
|
||||||
@@ -138,7 +132,8 @@ async function findTorrentInSource(record) {
|
|||||||
return Promise.reject(new Error(`Failed to find torrent ${record.title}`));
|
return Promise.reject(new Error(`Failed to find torrent ${record.title}`));
|
||||||
}
|
}
|
||||||
return Promise.resolve(torrentFound)
|
return Promise.resolve(torrentFound)
|
||||||
.then((torrent) => thepiratebay.torrent(torrent.torrentId));
|
.then((torrent) => thepiratebay.torrent(torrent.torrentId)
|
||||||
|
.catch(() => thepiratebay.torrent(torrent.torrentId)));
|
||||||
}
|
}
|
||||||
|
|
||||||
async function findTorrentViaBing(record) {
|
async function findTorrentViaBing(record) {
|
||||||
@@ -148,11 +143,12 @@ async function findTorrentViaBing(record) {
|
|||||||
result.description.includes('Get this torrent')))
|
result.description.includes('Get this torrent')))
|
||||||
.then((result) => {
|
.then((result) => {
|
||||||
if (!result) {
|
if (!result) {
|
||||||
throw new Error(`Failed to find torrent ${record.title}`);
|
console.warn(`Failed to find torrent ${record.title}`);
|
||||||
|
return Promise.resolve(undefined);
|
||||||
}
|
}
|
||||||
return result.link.match(/torrent\/(\w+)\//)[1];
|
return result.link.match(/torrent\/(\w+)\//)[1];
|
||||||
})
|
})
|
||||||
.then((torrentId) => thepiratebay.torrent(torrentId))
|
.then((torrentId) => torrentId && thepiratebay.torrent(torrentId))
|
||||||
}
|
}
|
||||||
|
|
||||||
function downloadDump(dump) {
|
function downloadDump(dump) {
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
const moment = require('moment');
|
const moment = require('moment');
|
||||||
const Bottleneck = require('bottleneck');
|
const Bottleneck = require('bottleneck');
|
||||||
const LineByLineReader = require('line-by-line');
|
const LineByLineReader = require('line-by-line');
|
||||||
const fs = require('fs');
|
|
||||||
const decode = require('magnet-uri');
|
const decode = require('magnet-uri');
|
||||||
const thepiratebay = require('./thepiratebay_api.js');
|
const thepiratebay = require('./thepiratebay_api.js');
|
||||||
const { Type } = require('../../lib/types');
|
const { Type } = require('../../lib/types');
|
||||||
@@ -13,6 +12,7 @@ const CSV_FILE_PATH = '/tmp/tpb.csv';
|
|||||||
const limiter = new Bottleneck({ maxConcurrent: 40 });
|
const limiter = new Bottleneck({ maxConcurrent: 40 });
|
||||||
|
|
||||||
async function scrape() {
|
async function scrape() {
|
||||||
|
// await processTorrentRecord({ torrentId: 35313644, category: 'Video' });
|
||||||
console.log(`starting to scrape tpb dump...`);
|
console.log(`starting to scrape tpb dump...`);
|
||||||
//const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
|
//const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
|
||||||
const checkPoint = 4115000;
|
const checkPoint = 4115000;
|
||||||
@@ -88,16 +88,16 @@ async function processTorrentRecord(record) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const torrentFound = await thepiratebay.torrent(record.torrentId).catch(() => undefined);
|
const torrentFound = await thepiratebay.torrent(record.torrentId);
|
||||||
|
|
||||||
if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
|
if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
|
||||||
return createSkipTorrentEntry(record);
|
return createSkipTorrentEntry(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
const torrent = {
|
const torrent = {
|
||||||
infoHash: record.infoHash,
|
infoHash: torrentFound.infoHash,
|
||||||
provider: NAME,
|
provider: NAME,
|
||||||
torrentId: record.torrentId,
|
torrentId: torrentFound.torrentId,
|
||||||
title: torrentFound.name,
|
title: torrentFound.name,
|
||||||
size: torrentFound.size,
|
size: torrentFound.size,
|
||||||
type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,
|
type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,
|
||||||
|
|||||||
Reference in New Issue
Block a user