From 7e7e46a5f328b1ba02e10652e696311153b952a3 Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Mon, 16 Mar 2020 14:48:57 +0100 Subject: [PATCH] [scraper] updates tpb proxies --- addon/package.json | 2 +- package-lock.json | 4 ++-- package.json | 2 +- scraper/index.js | 6 ++++-- scraper/manual/manual.js | 4 ++-- scraper/scrapers/kickass/kickass_api.js | 2 +- scraper/scrapers/thepiratebay/thepiratebay_api.js | 6 ++++-- scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js | 7 ++++++- .../thepiratebay/thepiratebay_unofficial_dump_scraper.js | 2 +- 9 files changed, 22 insertions(+), 13 deletions(-) diff --git a/addon/package.json b/addon/package.json index 292e3fa..d80ed50 100644 --- a/addon/package.json +++ b/addon/package.json @@ -11,7 +11,7 @@ "cache-manager": "^2.9.0", "cache-manager-mongodb": "^0.2.1", "express-rate-limit": "^5.1.1", - "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#7c00602bc1c405f5574758eeabb72b133fea81d5", + "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#7259b01bfe6e1fbc3879ba68d9c58ebac84029e9", "pg": "^7.8.2", "pg-hstore": "^2.3.2", "sequelize": "^4.43.0", diff --git a/package-lock.json b/package-lock.json index 07c51bb..a194ed7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1385,8 +1385,8 @@ } }, "parse-torrent-title": { - "version": "git://github.com/TheBeastLT/parse-torrent-title.git#7c00602bc1c405f5574758eeabb72b133fea81d5", - "from": "git://github.com/TheBeastLT/parse-torrent-title.git#7c00602bc1c405f5574758eeabb72b133fea81d5", + "version": "git://github.com/TheBeastLT/parse-torrent-title.git#7259b01bfe6e1fbc3879ba68d9c58ebac84029e9", + "from": "git://github.com/TheBeastLT/parse-torrent-title.git#7259b01bfe6e1fbc3879ba68d9c58ebac84029e9", "requires": { "moment": "^2.24.0" } diff --git a/package.json b/package.json index 932d1d7..777747e 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "node-schedule": "^1.3.2", "nodejs-bing": "^0.1.0", "parse-torrent": "^6.1.2", - "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#7c00602bc1c405f5574758eeabb72b133fea81d5", + "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#7259b01bfe6e1fbc3879ba68d9c58ebac84029e9", "peer-search": "^0.6.x", "pg": "^7.8.2", "pg-hstore": "^2.3.2", diff --git a/scraper/index.js b/scraper/index.js index cc00b10..f76c169 100644 --- a/scraper/index.js +++ b/scraper/index.js @@ -13,12 +13,14 @@ const thepiratebayDumpScraper = require('./scrapers/thepiratebay/thepiratebay_du const thepiratebayUnofficialDumpScraper = require('./scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper'); const PROVIDERS = [ - // horribleSubsScraper, - // rarbgScraper, + horribleSubsScraper, + rarbgScraper, thepiratebayScraper, kickassScraper, leetxScraper // rarbgDumpScraper + // thepiratebayDumpScraper + // thepiratebayUnofficialDumpScraper ]; const SCRAPE_CRON = process.env.SCRAPE_CRON || '* * 0/4 * * *'; diff --git a/scraper/manual/manual.js b/scraper/manual/manual.js index fa7fa5c..aa8866f 100644 --- a/scraper/manual/manual.js +++ b/scraper/manual/manual.js @@ -190,7 +190,7 @@ async function findAllFiles() { //addMissingEpisodes().then(() => console.log('Finished')); //findAllFiles().then(() => console.log('Finished')); //updateMovieCollections().then(() => console.log('Finished')); -reapplyEpisodeDecomposing('aec7bcac457ad68924e7119f859cf6fa3878f9f5', false).then(() => console.log('Finished')); +reapplyEpisodeDecomposing('0b6c0f0692bdb151efb87e3de90e46e3b177444e', false).then(() => console.log('Finished')); //reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished')); -// reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished')); +//reapplyDecomposingToTorrentsOnRegex('.*Boku no Hero Academia.*').then(() => console.log('Finished')); //reapplyManualHashes().then(() => console.log('Finished')); \ No newline at end of file diff --git a/scraper/scrapers/kickass/kickass_api.js b/scraper/scrapers/kickass/kickass_api.js index ac06d26..5ce6952 100644 --- a/scraper/scrapers/kickass/kickass_api.js +++ b/scraper/scrapers/kickass/kickass_api.js @@ -7,7 +7,7 @@ const Promises = require('../../lib/promises'); const defaultProxies = [ 'https://katcr.co' ]; -const defaultTimeout = 10000; +const defaultTimeout = 30000; const Categories = { MOVIE: 'movies', diff --git a/scraper/scrapers/thepiratebay/thepiratebay_api.js b/scraper/scrapers/thepiratebay/thepiratebay_api.js index 0e45289..39b5797 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_api.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_api.js @@ -5,10 +5,12 @@ const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); const defaultProxies = [ - 'https://thepiratebay.org', + // 'https://thepiratebay.org', 'https://proxybay.pro', 'https://ukpiratebayproxy.com', - 'https://thepiratebayproxy.info' + 'https://thepiratebayproxy.info', + 'https://mypiratebay.co', + 'https://thepiratebay.asia', ]; const dumpUrl = '/static/dump/csv/'; const defaultTimeout = 10000; diff --git a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js index 86ce85f..8e7ff83 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js @@ -7,7 +7,6 @@ const fs = require('fs'); const thepiratebay = require('./thepiratebay_api.js'); const bing = require('nodejs-bing'); const { Type } = require('../../lib/types'); -const repository = require('../../lib/repository'); const { escapeHTML } = require('../../lib/metadata'); const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries'); @@ -20,6 +19,7 @@ async function scrape() { const lastDump = { updatedAt: 2147000000 }; //const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]); + const checkPoint = 611000; if (lastDump) { console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`); @@ -34,6 +34,11 @@ async function scrape() { if (entriesProcessed % 1000 === 0) { console.log(`Processed ${entriesProcessed} entries`); } + if (entriesProcessed <= checkPoint) { + entriesProcessed++; + return; + } + const row = line.match(/(?<=^|;)(".*"|[^;]+)(?=;|$)/g); if (row.length !== 4) { console.log(`Invalid row: ${line}`); diff --git a/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js index 43be053..31e6ae1 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper.js @@ -12,7 +12,7 @@ const CSV_FILE_PATH = '/tmp/tpb.csv'; const limiter = new Bottleneck({ maxConcurrent: 40 }); async function scrape() { - // await processTorrentRecord({ torrentId: 35313644, category: 'Video' }); + // await processTorrentRecord({ torrentId: 26877339, category: 'Video' }); console.log(`starting to scrape tpb dump...`); //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate(); const checkPoint = 4115000;