Added back original scrapers, integrated with PGSQL
@@ -1,4 +0,0 @@
*.lock
*/scrape-cache
Dockerfile
docker-compose.yml
@@ -1,12 +0,0 @@
FROM python:3.10.13
COPY . .
RUN pip --no-cache-dir install pipenv && \
    apt-get update && apt-get install -y sqlite3 && \
    apt-get clean autoclean && \
    apt-get autoremove --yes && \
    mkdir /sqlite && \
    sqlite3 /sqlite/torrentio.sqlite "VACUUM;" && \
    cat torrentio.sql | sqlite3 /sqlite/torrentio.sqlite && \
    ls -l /sqlite && \
    pipenv install && touch .init
CMD [ "pipenv", "run", "python", "-u", "scraper.py" ]
@@ -1,16 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
requests = "*"
beautifulsoup4 = "*"
imdbpy = "*"
apscheduler = "*"

[dev-packages]

[requires]
python_version = "3.10"
python_full_version = "3.10.13"
@@ -1,15 +0,0 @@
{
    "1337x": {
        "sleep": 10,
        "urls_to_scrape": [
            "https://1337x.to/trending/w/movies/",
            "https://1337x.to/trending/d/movies/"
        ],
        "scrape_interval": {
            "days": 0,
            "hours": 1,
            "minutes": 0,
            "seconds": 0
        }
    }
}
@@ -2,30 +2,43 @@ version: '2'

services:
  mongodb:
    restart: always
    image: docker.io/bitnami/mongodb:7.0
    ports:
      - "27017:27017"
    volumes:
      - 'mongodb_data:/bitnami/mongodb'
  scraper:
    build: ./
    build: ../../scraper/
    restart: always
    volumes:
      - 'sqlite_data:/sqlite'
    ports:
      - "7001:7001"
    environment:
      - PORT=7001
      - MONGODB_URI=mongodb://mongodb:27017/torrentio
      - DATABASE_URI=postgres://postgres@postgres:5432/torrentio
  torrentio:
    build: ../
    volumes:
      - 'sqlite_data:/sqlite'
    restart: always
    ports:
      - "7000:7000"
    links:
      - "mongodb:mongodb"
    environment:
      - MONGODB_URI=mongodb://mongodb:27017/torrentio
      - DATABASE_URI=sqlite:/sqlite/torrentio.sqlite
      - DATABASE_URI=postgres://postgres@postgres:5432/torrentio
  postgres:
    image: postgres:14-alpine
    ports:
      - 5432:5432
    volumes:
      - pg_data:/var/lib/postgresql/data
      - './init:/docker-entrypoint-initdb.d'
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
      - POSTGRES_USER=postgres
      - POSTGRES_DB=torrentio

volumes:
  mongodb_data:
    driver: local
  sqlite_data:
  pg_data:
    driver: local
53 addon/hosted/init/torrentio-pgsql.sql Normal file
@@ -0,0 +1,53 @@
CREATE TABLE "torrents" (
    "infoHash" varchar(64) PRIMARY KEY,
    "provider" varchar(32) NOT NULL,
    "torrentId" varchar(128),
    "title" varchar(256) NOT NULL,
    "size" bigint,
    "type" varchar(16) NOT NULL,
    "uploadDate" date NOT NULL,
    "seeders" integer,
    "trackers" varchar(4096),
    "languages" varchar(4096),
    "resolution" varchar(16),
    "createdAt" date,
    "updatedAt" date,
    "reviewed" boolean,
    "opened" boolean
);

CREATE TABLE "files" (
    "id" SERIAL PRIMARY KEY,
    "infoHash" varchar(64) NOT NULL,
    "fileIndex" integer,
    "title" varchar(256) NOT NULL,
    "size" bigint,
    "imdbId" varchar(32),
    "imdbSeason" integer,
    "imdbEpisode" integer,
    "kitsuId" integer,
    "kitsuEpisode" integer,
    "createdAt" date,
    "updatedAt" date,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
    UNIQUE("infoHash","fileIndex")
);

CREATE TABLE "subtitles" (
    "id" SERIAL PRIMARY KEY,
    "infoHash" varchar(64) NOT NULL,
    "fileIndex" integer NOT NULL,
    "fileId" bigint,
    "title" varchar(512) NOT NULL,
    "size" bigint,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
    FOREIGN KEY("fileId") REFERENCES files("id") ON DELETE SET NULL
);

CREATE TABLE "contents" (
    "infoHash" varchar(64),
    "fileIndex" integer,
    "path" varchar(256),
    "size" bigint,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE
);
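As a quick sanity check of the schema above, a hypothetical pair of inserts that satisfies the foreign-key and uniqueness constraints might look like this (the infoHash, title, and imdbId values are made up for illustration):

```sql
-- Hypothetical sample data; values are illustrative only.
INSERT INTO torrents ("infoHash", "provider", "title", "size", "type", "uploadDate")
VALUES ('abc123', '1337x', 'Example Movie 2024', 1073741824, 'movie', '2024-01-01');

-- files."infoHash" must reference an existing torrents row (ON DELETE CASCADE).
INSERT INTO files ("infoHash", "fileIndex", "title", "size", "imdbId")
VALUES ('abc123', 0, 'Example.Movie.2024.mkv', 1073741824, 'tt0000001');
```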
@@ -1,141 +0,0 @@
import os
import re
import math
import requests
import time
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from shared import imdb_find, build_and_write, extract_title, read_config

TORRENT_CACHES = ('http://itorrents.org', 'http://torrage.info', 'http://btcache.me')
BASE_URL = 'https://1337x.to'
MOVIE_BASE = 'https://1337x.to/movie-library'
MOVIE_LIBRARY_MAX_PAGE = 301
CACHE_DIR = "./scrape-cache"
PROVIDER = "1337x"
SLEEP_BETWEEN_REQUESTS = read_config(PROVIDER, "sleep")

if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

def get_links_and_process(url):
    links = []
    print(f"Requesting movies from: {url}")
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    main = req.text
    soup = BeautifulSoup(main, "html.parser")
    for a in soup.find_all("a"):
        if a.get("href").startswith("/torrent/"):
            links.append((a.get("href"), extract_title(a.text)))
    process_links(links)

def get_links_initial():
    links = []
    for i in range(1, MOVIE_LIBRARY_MAX_PAGE + 1):
        try:
            print(f"Sleeping {SLEEP_BETWEEN_REQUESTS}")
            time.sleep(SLEEP_BETWEEN_REQUESTS)

            main = ""
            if os.path.isfile(f"{CACHE_DIR}/main-{i}.html"):
                print(f"Reading main page({i}) from cache...")
                main = open(f"{CACHE_DIR}/main-{i}.html", "r").read()
            else:
                print(f"Requesting main index: {MOVIE_BASE}/{i}/")
                req = requests.get(f"{MOVIE_BASE}/{i}/", headers={'User-Agent': 'Mozilla/5.0'})
                if req.status_code == 404:
                    print(f"Page does not exist: {MOVIE_BASE}/{i}/. Breaking loop.")
                    break
                main = req.text
                open(f"{CACHE_DIR}/main-{i}.html", "w+").write(main)

            movies = []
            soup = BeautifulSoup(main, "html.parser")
            for h3 in soup.find_all("h3"):
                a = h3.findChildren("a", href=True)[0]
                movie_link = a.get("href")
                movie_title = a.text
                movies.append((movie_title, movie_link))

            for movie in movies:
                if os.path.isfile(f"{CACHE_DIR}{movie[1]}html.html"):
                    print(f"Reading movie page({movie[0]}) from cache...")
                    main = open(f"{CACHE_DIR}{movie[1]}html.html").read()
                else:
                    print(f"Requesting movie releases: {BASE_URL}{movie[1]}")
                    req = requests.get(f"{BASE_URL}{movie[1]}", headers={'User-Agent': 'Mozilla/5.0'})
                    main = req.text
                    if not os.path.exists(f"{CACHE_DIR}{movie[1]}"):
                        os.makedirs(f"{CACHE_DIR}{movie[1]}")
                    open(f"{CACHE_DIR}{movie[1]}html.html", "w+").write(main)
                soup = BeautifulSoup(main, "html.parser")
                for href in soup.find_all("a"):
                    if href.get("href").startswith("/torrent/"):
                        links.append((href.get("href"), movie[0]))
        except Exception as e:
            print(e)
    return links

def process_links(links):
    print(f"Checking links...({len(links)})")
    counter = 1
    for link in links:
        try:
            print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
            req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
            torrent_html = req.text
            t = {}
            soup = BeautifulSoup(torrent_html, "html.parser")
            t['title'] = soup.find("h1").text.strip()
            t['size'] = 0
            t['magnets'] = []
            t['torrents'] = []
            all_a = soup.find_all("a")
            for a in all_a:
                if a.get("href").startswith("https://www.imdb.com/title"):
                    t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
                if a.get("href").startswith("magnet:"):
                    t['magnets'].append(a.get("href"))
                if a.get("href").startswith(TORRENT_CACHES):
                    t['torrents'].append(a.get("href"))
            all_li = soup.find_all("li")
            for li in all_li:
                if "Total size" in li.text:
                    size = li.findChildren("span")[0].text
                    mb = "MB" in size
                    size = re.sub(r'\s(GB|MB)', '', size).split('.')[0].replace(',', '')
                    if mb:
                        t['size'] = math.trunc(float(size) * 1048576)  # 1 MB = 1048576 bytes
                    else:
                        t['size'] = math.trunc(float(size) * 1073741824)  # 1 GB = 1073741824 bytes
            t['seeders'] = soup.find("span", {"class": "seeds"}).text
            all_p = soup.find_all("p")
            for p in all_p:
                if "Infohash :" in p.text:
                    t['infoHash'] = p.findChildren("span")[0].text.lower()
            t['files'] = []
            file_div = soup.find("div", {"id": "files"})
            for li in file_div.findChildren("li"):
                f = re.sub(r'\s\(.*\)', '', li.text)
                t["files"].append(f)
            t['trackers'] = []
            tracker_div = soup.find("div", {"id": "tracker-list"})
            for tracker in tracker_div.findChildren("li"):
                t['trackers'].append(tracker.text.strip())
            if 'imdbid' not in t or t['imdbid'] == '':
                found = re.search(r"https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
                if found is not None:
                    t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
                else:
                    new_id = imdb_find(link[1])
                    if new_id is not None:
                        t['imdbid'] = f"tt{new_id}"
                    else:
                        print(f"{t['title']} has no IMDB Id")
                        continue
            build_and_write(t)
        except Exception:
            counter += 1
            continue
        counter += 1
@@ -1,41 +0,0 @@
import os
import threading
from p1337x import process_links, get_links_initial, get_links_and_process
from apscheduler.schedulers.background import BackgroundScheduler
from shared import read_config

print("Scraper starting...")

if os.path.isfile(".init"):
    print("Found init file, executing initial sync. Be patient.")
    process_links(get_links_initial())
    os.remove(".init")

sched = BackgroundScheduler(timezone="America/New_York")
sched.start()

# 1337x
PROVIDER = "1337x"
pages = read_config(PROVIDER, "urls_to_scrape")
interval = read_config(PROVIDER, "scrape_interval")
for page in pages:
    j = sched.add_job(
        get_links_and_process,
        'interval',
        days=interval["days"],
        hours=interval["hours"],
        minutes=interval["minutes"],
        seconds=interval["seconds"],
        id=page,
        args=[page],
        max_instances=1)
    print(f"{page} will be scraped {j.next_run_time}.")

# Wait forever
main_thread = threading.main_thread()
while True:
    L = threading.enumerate()
    L.remove(main_thread)  # or avoid it in the for loop
    for t in L:
        t.join()
@@ -1,73 +0,0 @@
import sqlite3
import re
import os
import json
from imdb import Cinemagoer

SQLITE_PATH = "/sqlite/torrentio.sqlite"

ia = Cinemagoer()
CONFIG = "config.json"

def read_config(provider, key):
    if os.path.isfile(CONFIG):
        f = open(CONFIG, "r")
        cfg = json.load(f)
        return cfg[provider][key]

def filter_file(file):
    allowed_ext = ['.mp4', '.mkv', '.avi', '.mpeg', '.mpg', '.mpv', '.mov']
    if os.path.splitext(file)[1] in allowed_ext:
        return True
    return False

def create_connection(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file, check_same_thread=False)
    except Exception as e:
        print(e)
        exit(1)
    return conn

sqlite = create_connection(SQLITE_PATH)

def build_and_write(torrent):
    try:
        print(f"Recording {torrent['title']} in the database")
        q = "INSERT OR REPLACE INTO torrents (infoHash, provider, title, size, type, uploadDate, seeders, trackers) VALUES (?,?,?,?,?,?,?,?)"
        p = (torrent['infoHash'], '1337x', torrent['title'], torrent['size'], 'movie', '1/1/2024', torrent['seeders'], ','.join(torrent['trackers']))
        cursor = sqlite.cursor()
        cursor.execute(q, p)
        for index, file in enumerate(torrent['files']):  # enumerate avoids index() lookups, which misnumber duplicate names
            if filter_file(file):
                q = "INSERT OR REPLACE INTO files (infoHash, fileIndex, title, size, imdbId) VALUES (?,?,?,?,?)"
                p = (torrent['infoHash'], index, file, torrent['size'], torrent['imdbid'])
                cursor.execute(q, p)
        sqlite.commit()
        cursor.close()
    except sqlite3.Error as error:
        print(error)

def imdb_find(name):
    movie = ia.search_movie(name)
    if len(movie) >= 1:
        return movie[0].movieID
    return None

def extract_title(filename):
    try:
        filename = filename.strip()
        filename = filename.replace('.', ' ')
        res = re.search(r'([^\\]+)\.(avi|mkv|mpeg|mpg|mov|mp4)$', filename)
        if res:
            filename = res.group(1)
        res = re.search(r'(.*?)(dvdrip|xvid| cd[0-9]|dvdscr|brrip|divx|[\{\(\[]?[0-9]{4}).*', filename)
        if res:
            filename = res.group(1)
        res = re.search(r'(.*?)\(.*\)(.*)', filename)
        if res:
            filename = res.group(1)
        return filename
    except Exception:
        return ""
@@ -1,44 +0,0 @@
CREATE TABLE "torrents" (
    "infoHash" TEXT,
    "provider" TEXT NOT NULL,
    "torrentId" TEXT,
    "title" TEXT NOT NULL,
    "size" INTEGER,
    "type" TEXT NOT NULL,
    "uploadDate" TEXT NOT NULL,
    "seeders" INTEGER,
    "trackers" TEXT,
    "languages" TEXT,
    "resolution" TEXT,
    "createdAt" TEXT,
    "updatedAt" TEXT,
    PRIMARY KEY("infoHash")
);

CREATE TABLE "files" (
    "id" INTEGER,
    "infoHash" TEXT NOT NULL,
    "fileIndex" INTEGER,
    "title" TEXT,
    "size" INTEGER,
    "imdbId" TEXT,
    "imdbSeason" INTEGER,
    "imdbEpisode" INTEGER,
    "kitsuId" INTEGER,
    "kitsuEpisode" INTEGER,
    "createdAt" TEXT,
    "updatedAt" TEXT,
    FOREIGN KEY("infoHash") REFERENCES "torrents"("infoHash") ON DELETE CASCADE,
    PRIMARY KEY("id" AUTOINCREMENT),
    UNIQUE(infoHash, fileIndex)
);

CREATE TABLE "subtitles" (
    "infoHash" TEXT NOT NULL,
    "fileIndex" INTEGER NOT NULL,
    "fileId" INTEGER,
    "title" TEXT NOT NULL,
    "size" INTEGER NOT NULL,
    FOREIGN KEY("fileId") REFERENCES "files"("id") ON DELETE SET NULL,
    FOREIGN KEY("infoHash") REFERENCES "torrents"("infoHash") ON DELETE CASCADE
);
1384 addon/package-lock.json generated
File diff suppressed because it is too large
@@ -23,14 +23,14 @@
    "named-queue": "^2.2.1",
    "offcloud-api": "^1.0.2",
    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#022408972c2a040f846331a912a6a8487746a654",
    "pg": "^8.10.0",
    "pg": "^8.11.3",
    "pg-hstore": "^2.3.4",
    "premiumize-api": "^1.0.3",
    "prom-client": "^12.0.0",
    "real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#d1f7eaa8593b947edbfbc8a92a176448b48ef445",
    "request-ip": "^3.3.0",
    "router": "^1.3.8",
    "sequelize": "^6.31.1",
    "sqlite3": "^5.1.7",
    "stremio-addon-sdk": "^1.6.10",
    "swagger-stats": "^0.99.7",
    "ua-parser-js": "^1.0.36",
12 scraper/Dockerfile Normal file
@@ -0,0 +1,12 @@
FROM node:16-alpine

RUN apk update && apk upgrade && \
    apk add --no-cache git

WORKDIR /home/node/app

COPY package*.json ./
RUN npm ci --only=production
COPY . .

CMD [ "node", "--insecure-http-parser", "index.js" ]
37 scraper/README.md Normal file
@@ -0,0 +1,37 @@
# Torrentio Scraper

## Initial dumps

### The Pirate Bay

https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA

https://thepiratebay.org/static/dump/csv/

### Kickass

https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA

https://web.archive.org/web/20150416071329/http://kickass.to/api

### RARBG

Scrape the movie and TV catalogs with [www.webscraper.io](https://www.webscraper.io/) to collect the available `imdbIds`, then use those ids via the API to search for torrents.

Movies sitemap
```json
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```

TV sitemap
```json
{"_id":"rarbg-tv","startUrl":["https://rarbgmirror.org/catalog/tv/[1-609]"],"selectors":[{"id":"rarbg-tv-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```

### Migrating Database

When migrating the database to a new one, it is important to restart the `files_id_seq` sequence at the maximum file id value plus 1.

```sql
ALTER SEQUENCE files_id_seq RESTART WITH <last_file_id + 1>;
```
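If you prefer not to fill in `<last_file_id + 1>` by hand, the value can be derived in one statement; a minimal sketch, assuming a standard PostgreSQL setup where `setval` is available:

```sql
-- Set the sequence so the next nextval() returns MAX(id) + 1.
-- COALESCE covers the case of an empty files table.
SELECT setval('files_id_seq', COALESCE((SELECT MAX(id) FROM files), 0) + 1, false);
```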
14 scraper/index.js Normal file
@@ -0,0 +1,14 @@
const express = require("express");
const server = express();
const { connect } = require('./lib/repository');
const { startScraper } = require('./scheduler/scheduler')

server.get('/', function (req, res) {
  res.sendStatus(200);
});

server.listen(process.env.PORT || 7000, async () => {
  await connect();
  console.log('Scraper started');
  startScraper();
});
72 scraper/lib/cache.js Normal file
@@ -0,0 +1,72 @@
const cacheManager = require('cache-manager');
const mangodbStore = require('cache-manager-mongodb');

const GLOBAL_KEY_PREFIX = 'stremio-torrentio';
const IMDB_ID_PREFIX = `${GLOBAL_KEY_PREFIX}|imdb_id`;
const KITSU_ID_PREFIX = `${GLOBAL_KEY_PREFIX}|kitsu_id`;
const METADATA_PREFIX = `${GLOBAL_KEY_PREFIX}|metadata`;
const TRACKERS_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|trackers`;

const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days
const MEMORY_TTL = process.env.METADATA_TTL || 2 * 60 * 60; // 2 hours
const TRACKERS_TTL = 2 * 24 * 60 * 60; // 2 days

const MONGO_URI = process.env.MONGODB_URI;

const memoryCache = initiateMemoryCache();
const remoteCache = initiateRemoteCache();

function initiateRemoteCache() {
  if (MONGO_URI) {
    return cacheManager.caching({
      store: mangodbStore,
      uri: MONGO_URI,
      options: {
        collection: 'torrentio_scraper_collection',
        useUnifiedTopology: true
      },
      ttl: GLOBAL_TTL,
      ignoreCacheErrors: true
    });
  } else {
    return cacheManager.caching({
      store: 'memory',
      ttl: GLOBAL_TTL
    });
  }
}

function initiateMemoryCache() {
  return cacheManager.caching({
    store: 'memory',
    ttl: MEMORY_TTL
  });
}

function cacheWrap(cache, key, method, options) {
  return cache.wrap(key, method, options);
}

function cacheWrapImdbId(key, method) {
  return cacheWrap(remoteCache, `${IMDB_ID_PREFIX}:${key}`, method, { ttl: GLOBAL_TTL });
}

function cacheWrapKitsuId(key, method) {
  return cacheWrap(remoteCache, `${KITSU_ID_PREFIX}:${key}`, method, { ttl: GLOBAL_TTL });
}

function cacheWrapMetadata(id, method) {
  return cacheWrap(memoryCache, `${METADATA_PREFIX}:${id}`, method, { ttl: MEMORY_TTL });
}

function cacheTrackers(method) {
  return cacheWrap(memoryCache, `${TRACKERS_KEY_PREFIX}`, method, { ttl: TRACKERS_TTL });
}

module.exports = {
  cacheWrapImdbId,
  cacheWrapKitsuId,
  cacheWrapMetadata,
  cacheTrackers
};
64 scraper/lib/extension.js Normal file
@@ -0,0 +1,64 @@
const VIDEO_EXTENSIONS = [
  "3g2",
  "3gp",
  "avi",
  "flv",
  "mkv",
  "mk3d",
  "mov",
  "mp2",
  "mp4",
  "m4v",
  "mpe",
  "mpeg",
  "mpg",
  "mpv",
  "webm",
  "wmv",
  "ogm",
  "divx"
];
const SUBTITLE_EXTENSIONS = [
  "aqt",
  "gsub",
  "jss",
  "sub",
  "ttxt",
  "pjs",
  "psb",
  "rt",
  "smi",
  "slt",
  "ssf",
  "srt",
  "ssa",
  "ass",
  "usf",
  "idx",
  "vtt"
];
const DISK_EXTENSIONS = [
  "iso",
  "m2ts",
  "ts",
  "vob"
]

function isVideo(filename) {
  return isExtension(filename, VIDEO_EXTENSIONS);
}

function isSubtitle(filename) {
  return isExtension(filename, SUBTITLE_EXTENSIONS);
}

function isDisk(filename) {
  return isExtension(filename, DISK_EXTENSIONS);
}

function isExtension(filename, extensions) {
  const extensionMatch = filename.match(/\.(\w{2,4})$/);
  return extensionMatch && extensions.includes(extensionMatch[1].toLowerCase());
}

module.exports = { isVideo, isSubtitle, isDisk }
155 scraper/lib/metadata.js Normal file
@@ -0,0 +1,155 @@
const needle = require('needle');
const nameToImdb = require('name-to-imdb');
const googleIt = require('google-it');
const googleSr = require('google-sr');
const bing = require('nodejs-bing');
const he = require('he');
const { cacheWrapImdbId, cacheWrapKitsuId, cacheWrapMetadata } = require('./cache');
const { Type } = require('./types');
const { getRandomUserAgent } = require('./requestHelper');

const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
const KITSU_URL = 'https://anime-kitsu.strem.fun';
const TIMEOUT = 20000;

function getMetadata(id, type = Type.SERIES) {
  if (!id) {
    return Promise.reject("no valid id provided");
  }

  const key = Number.isInteger(id) || id.match(/^\d+$/) ? `kitsu:${id}` : id;
  const metaType = type === Type.MOVIE ? Type.MOVIE : Type.SERIES;
  return cacheWrapMetadata(key, () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`)
      .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`))
      .catch(() => {
        // try different type in case there was a mismatch
        const otherType = metaType === Type.MOVIE ? Type.SERIES : Type.MOVIE;
        return _requestMetadata(`${CINEMETA_URL}/meta/${otherType}/${key}.json`)
      })
      .catch((error) => {
        throw new Error(`failed metadata query ${key} due: ${error.message}`);
      }));
}

function _requestMetadata(url) {
  return needle('get', url, { open_timeout: TIMEOUT })
      .then((response) => {
        const body = response.body;
        if (body && body.meta && (body.meta.imdb_id || body.meta.kitsu_id)) {
          return {
            kitsuId: body.meta.kitsu_id,
            imdbId: body.meta.imdb_id,
            type: body.meta.type,
            title: body.meta.name,
            year: body.meta.year,
            country: body.meta.country,
            genres: body.meta.genres,
            status: body.meta.status,
            videos: (body.meta.videos || [])
                .map((video) => Number.isInteger(video.imdbSeason)
                    ? {
                      name: video.name || video.title,
                      season: video.season,
                      episode: video.episode,
                      imdbSeason: video.imdbSeason,
                      imdbEpisode: video.imdbEpisode
                    }
                    : {
                      name: video.name || video.title,
                      season: video.season,
                      episode: video.episode,
                      kitsuId: video.kitsu_id,
                      kitsuEpisode: video.kitsuEpisode,
                      released: video.released
                    }
                ),
            episodeCount: Object.values((body.meta.videos || [])
                .filter((entry) => entry.season !== 0 && entry.episode !== 0)
                .sort((a, b) => a.season - b.season)
                .reduce((map, next) => {
                  map[next.season] = map[next.season] + 1 || 1;
                  return map;
                }, {})),
            totalCount: body.meta.videos && body.meta.videos
                .filter((entry) => entry.season !== 0 && entry.episode !== 0).length
          };
        } else {
          throw new Error('No search results');
        }
      });
}

function escapeTitle(title) {
  return title.toLowerCase()
      .normalize('NFKD') // normalize non-ASCII characters
      .replace(/[\u0300-\u036F]/g, '')
      .replace(/&/g, 'and')
      .replace(/[;, ~./]+/g, ' ') // replace dots, commas or underscores with spaces
      .replace(/[^\w \-()+#@!'\u0400-\u04ff]+/g, '') // remove all non-alphanumeric chars
      .replace(/^\d{1,2}[.#\s]+(?=(?:\d+[.\s]*)?[\u0400-\u04ff])/i, '') // remove russian movie numbering
      .replace(/\s{2,}/, ' ') // replace multiple spaces
      .trim();
}

function escapeHTML(title) {
  return he.decode(title)
      .replace(/&amp;/g, '&');
}

async function getImdbId(info, type) {
  const name = escapeTitle(info.title);
  const year = info.year || info.date && info.date.slice(0, 4);
  const key = `${name}_${year}_${type}`;
  const query = `${name} ${year || ''} ${type} imdb`;

  return cacheWrapImdbId(key,
      () => new Promise((resolve, reject) => {
        nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
          if (res) {
            resolve(res);
          } else {
            reject(err || new Error('failed imdbId search'));
          }
        });
        // }).catch(() => googleIt({ query, userAgent: getRandomUserAgent(), disableConsole: true })
        //     .then(results => results.length ? results : Promise.reject('No results'))
      }).catch(() => googleSr(query)
          .then(response => response.searchResults.length ? response.searchResults : Promise.reject('No results'))
          // .catch(() => bing.web(query))
          .then(results => results
              .map(result => result.link)
              .find(result => result.includes('imdb.com/title/')))
          .then(result => result && result.match(/imdb\.com\/title\/(tt\d+)/))
          .then(match => match && match[1])))
      .then(imdbId => imdbId && 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0'));
}

async function getKitsuId(info) {
  const title = escapeTitle(info.title.replace(/\s\|\s.*/, ''));
  const year = info.year ? ` ${info.year}` : '';
  const season = info.season > 1 ? ` S${info.season}` : '';
  const key = `${title}${year}${season}`;
  const query = encodeURIComponent(key);

  return cacheWrapKitsuId(key,
      () => needle('get', `${KITSU_URL}/catalog/series/kitsu-anime-list/search=${query}.json`, { open_timeout: 60000 })
          .then((response) => {
            const body = response.body;
            if (body && body.metas && body.metas.length) {
              return body.metas[0].id.replace('kitsu:', '');
            } else {
              throw new Error('No search results');
            }
          }));
}

async function isEpisodeImdbId(imdbId) {
  if (!imdbId) {
    return false;
  }
  return needle('get', `https://www.imdb.com/title/${imdbId}/`, { open_timeout: 10000, follow: 2 })
      .then(response => !!(response.body && response.body.includes('video.episode')))
      .catch((err) => false);
}

module.exports = { getMetadata, getImdbId, getKitsuId, isEpisodeImdbId, escapeHTML, escapeTitle };
100 scraper/lib/parseHelper.js Normal file
@@ -0,0 +1,100 @@
const { parse } = require('parse-torrent-title');
const { Type } = require('./types');

const MULTIPLE_FILES_SIZE = 4 * 1024 * 1024 * 1024; // 4 GB

function parseSeriesVideos(torrent, videos) {
  const parsedTorrentName = parse(torrent.title);
  const hasMovies = parsedTorrentName.complete || !!torrent.title.match(/movies?(?:\W|$)/i);
  const parsedVideos = videos.map(video => parseSeriesVideo(video, parsedTorrentName));
  return parsedVideos.map(video => ({ ...video, isMovie: isMovieVideo(video, parsedVideos, torrent.type, hasMovies) }));
}

function parseSeriesVideo(video, parsedTorrentName) {
  const videoInfo = parse(video.name);
  // the episode may be in a folder containing season number
  if (!Number.isInteger(videoInfo.season) && video.path.includes('/')) {
    const folders = video.path.split('/');
    const pathInfo = parse(folders[folders.length - 2]);
    videoInfo.season = pathInfo.season;
  }
  if (!Number.isInteger(videoInfo.season) && parsedTorrentName.season) {
    videoInfo.season = parsedTorrentName.season;
  }
  if (!Number.isInteger(videoInfo.season) && videoInfo.seasons && videoInfo.seasons.length > 1) {
    // in case single file was interpreted as having multiple seasons
    videoInfo.season = videoInfo.seasons[0];
  }
  if (!Number.isInteger(videoInfo.season) && video.path.includes('/') && parsedTorrentName.seasons
      && parsedTorrentName.seasons.length > 1) {
    // russian seasons are usually named 'series name-2', i.e. Улицы разбитых фонарей-6/22. Одиночный выстрел.mkv
    const folderPathSeasonMatch = video.path.match(/[\u0400-\u04ff]-(\d{1,2})(?=.*\/)/);
    videoInfo.season = folderPathSeasonMatch && parseInt(folderPathSeasonMatch[1], 10) || undefined;
  }
  // sometimes video file does not have correct date format as in torrent title
  if (!videoInfo.episodes && !videoInfo.date && parsedTorrentName.date) {
    videoInfo.date = parsedTorrentName.date;
  }
  // limit number of episodes in case of incorrect parsing
  if (videoInfo.episodes && videoInfo.episodes.length > 20) {
    videoInfo.episodes = [videoInfo.episodes[0]];
    videoInfo.episode = videoInfo.episodes[0];
  }
  // force episode to any found number if it was not parsed
  if (!videoInfo.episodes && !videoInfo.date) {
    const epMatcher = videoInfo.title.match(
        /(?<!season\W*|disk\W*|movie\W*|film\W*)(?:^|\W)(\d{1,4})(?:a|b|c|v\d)?(?:\W|$)(?!disk|movie|film)/i);
    videoInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
    videoInfo.episode = videoInfo.episodes && videoInfo.episodes[0];
  }
  if (!videoInfo.episodes && !videoInfo.date) {
    const epMatcher = video.name.match(new RegExp(`(?:\\(${videoInfo.year}\\)|part)[._ ]?(\\d{1,3})(?:\\b|_)`, "i"));
    videoInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
    videoInfo.episode = videoInfo.episodes && videoInfo.episodes[0];
  }

  return { ...video, ...videoInfo };
}

function isMovieVideo(video, otherVideos, type, hasMovies) {
  if (Number.isInteger(video.season) && Array.isArray(video.episodes)) {
    // not a movie if the video has a season
    return false;
  }
  if (video.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i)) {
    // a movie if the video explicitly has a numbered movie keyword in the name, i.e. "1 Movie" or "Movie 1"
    return true;
  }
  if (!hasMovies && type !== Type.ANIME) {
    // not a movie if the torrent name does not contain a movies keyword, is not a pack torrent, and is not anime
    return false;
  }
  if (!video.episodes) {
    // a movie if there's no episode info
    return true;
  }
  // a movie if it contains year info and there aren't more than 3 videos with the same title and year,
  // as some series titles might contain a year in them.
  return !!video.year
      && otherVideos.length > 3
      && otherVideos.filter(other => other.title === video.title && other.year === video.year).length < 3;
}

function isPackTorrent(torrent) {
  if (torrent.pack) {
    return true;
  }
  const parsedInfo = parse(torrent.title);
  if (torrent.type === Type.MOVIE) {
    return parsedInfo.complete || typeof parsedInfo.year === 'string' || /movies/i.test(torrent.title);
  }
  const hasMultipleEpisodes = parsedInfo.complete ||
      torrent.size > MULTIPLE_FILES_SIZE ||
      (parsedInfo.seasons && parsedInfo.seasons.length > 1) ||
      (parsedInfo.episodes && parsedInfo.episodes.length > 1) ||
      (parsedInfo.seasons && !parsedInfo.episodes);
  const hasSingleEpisode = Number.isInteger(parsedInfo.episode) || (!parsedInfo.episodes && parsedInfo.date);
  return hasMultipleEpisodes && !hasSingleEpisode;
}

module.exports = { parseSeriesVideos, isPackTorrent }
57 scraper/lib/promises.js Normal file
@@ -0,0 +1,57 @@
/**
 * Execute promises in sequence one after another.
 */
async function sequence(promises) {
  return promises.reduce((promise, func) =>
      promise.then(result => func().then(Array.prototype.concat.bind(result))), Promise.resolve([]));
}

/**
 * Return first resolved promise as the result.
 */
async function first(promises) {
  return Promise.all(promises.map((p) => {
    // If a request fails, count that as a resolution so it will keep
    // waiting for other possible successes. If a request succeeds,
    // treat it as a rejection so Promise.all immediately bails out.
    return p.then(
        (val) => Promise.reject(val),
        (err) => Promise.resolve(err)
    );
  })).then(
      // If '.all' resolved, we've just got an array of errors.
      (errors) => Promise.reject(errors),
      // If '.all' rejected, we've got the result we wanted.
      (val) => Promise.resolve(val)
  );
}

/**
 * Delay promise
 */
async function delay(duration) {
  return new Promise((resolve) => setTimeout(resolve, duration));
}

/**
 * Timeout promise after a set time in ms
 */
async function timeout(timeoutMs, promise, message = 'Timed out') {
  return Promise.race([
    promise,
    new Promise(function (resolve, reject) {
      setTimeout(function () {
        reject(message);
      }, timeoutMs);
    })
  ]);
}

/**
 * Return most common value from given array.
 */
function mostCommonValue(array) {
  return array.sort((a, b) => array.filter(v => v === a).length - array.filter(v => v === b).length).pop();
}

module.exports = { sequence, first, delay, timeout, mostCommonValue };
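A brief usage sketch of the `first` and `timeout` helpers above; `probeMirror` is a hypothetical function standing in for any promise-returning request:

```javascript
const needle = require('needle');
const { first, timeout } = require('./promises');

// probeMirror is a made-up stand-in for any promise-returning request.
const probeMirror = url => needle('get', url).then(res => res.body);

// Resolves with whichever mirror answers first; rejects with an
// array of errors only if every mirror fails.
first([
  timeout(5000, probeMirror('https://mirror-a.example/data')),
  timeout(5000, probeMirror('https://mirror-b.example/data'))
]).then(data => console.log('fastest mirror answered:', data))
  .catch(errors => console.error('all mirrors failed:', errors));
```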
346 scraper/lib/repository.js Normal file
@@ -0,0 +1,346 @@
const moment = require('moment');
const Promises = require('./promises')
const { Sequelize, DataTypes, fn, col, literal } = require('sequelize');
const Op = Sequelize.Op;

const DATABASE_URI = process.env.DATABASE_URI;

const database = new Sequelize(
    DATABASE_URI,
    {
      logging: false
    }
);

const Provider = database.define('provider', {
  name: { type: DataTypes.STRING(32), primaryKey: true },
  lastScraped: { type: DataTypes.DATE },
  lastScrapedId: { type: DataTypes.STRING(128) }
});

const Torrent = database.define('torrent',
    {
      infoHash: { type: DataTypes.STRING(64), primaryKey: true },
      provider: { type: DataTypes.STRING(32), allowNull: false },
      torrentId: { type: DataTypes.STRING(512) },
      title: { type: DataTypes.STRING(512), allowNull: false },
      size: { type: DataTypes.BIGINT },
      type: { type: DataTypes.STRING(16), allowNull: false },
      uploadDate: { type: DataTypes.DATE, allowNull: false },
      seeders: { type: DataTypes.SMALLINT },
      trackers: { type: DataTypes.STRING(4096) },
      languages: { type: DataTypes.STRING(4096) },
      resolution: { type: DataTypes.STRING(16) },
      reviewed: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: false },
      opened: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: false }
    }
);

const File = database.define('file',
    {
      id: { type: DataTypes.BIGINT, autoIncrement: true, primaryKey: true },
      infoHash: {
        type: DataTypes.STRING(64),
        allowNull: false,
        references: { model: Torrent, key: 'infoHash' },
        onDelete: 'CASCADE'
      },
      fileIndex: { type: DataTypes.INTEGER },
      title: { type: DataTypes.STRING(512), allowNull: false },
      size: { type: DataTypes.BIGINT },
      imdbId: { type: DataTypes.STRING(32) },
      imdbSeason: { type: DataTypes.INTEGER },
      imdbEpisode: { type: DataTypes.INTEGER },
      kitsuId: { type: DataTypes.INTEGER },
      kitsuEpisode: { type: DataTypes.INTEGER }
    },
    {
      indexes: [
        {
          unique: true,
          name: 'files_unique_file_constraint',
          fields: [
            col('infoHash'),
            fn('COALESCE', (col('fileIndex')), -1),
            fn('COALESCE', (col('imdbId')), 'null'),
            fn('COALESCE', (col('imdbSeason')), -1),
            fn('COALESCE', (col('imdbEpisode')), -1),
            fn('COALESCE', (col('kitsuId')), -1),
            fn('COALESCE', (col('kitsuEpisode')), -1)
          ]
        },
        { unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] },
        { unique: false, fields: ['kitsuId', 'kitsuEpisode'] }
      ]
    }
);

const Subtitle = database.define('subtitle',
    {
      infoHash: {
        type: DataTypes.STRING(64),
        allowNull: false,
        references: { model: Torrent, key: 'infoHash' },
        onDelete: 'CASCADE'
      },
      fileIndex: {
        type: DataTypes.INTEGER,
        allowNull: false
      },
      fileId: {
        type: DataTypes.BIGINT,
        allowNull: true,
        references: { model: File, key: 'id' },
        onDelete: 'SET NULL'
      },
      title: { type: DataTypes.STRING(512), allowNull: false },
    },
    {
      timestamps: false,
      indexes: [
        {
          unique: true,
          name: 'subtitles_unique_subtitle_constraint',
          fields: [
            col('infoHash'),
            col('fileIndex'),
            fn('COALESCE', (col('fileId')), -1)
          ]
        },
        { unique: false, fields: ['fileId'] }
      ]
    }
);

const Content = database.define('content',
    {
      infoHash: {
        type: DataTypes.STRING(64),
        primaryKey: true,
        allowNull: false,
        references: { model: Torrent, key: 'infoHash' },
        onDelete: 'CASCADE'
      },
      fileIndex: {
        type: DataTypes.INTEGER,
        primaryKey: true,
        allowNull: false
      },
      path: { type: DataTypes.STRING(512), allowNull: false },
      size: { type: DataTypes.BIGINT },
    },
    {
      timestamps: false,
    }
);

const SkipTorrent = database.define('skip_torrent', {
  infoHash: { type: DataTypes.STRING(64), primaryKey: true },
});

Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false });
File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
Torrent.hasMany(Content, { foreignKey: 'infoHash', constraints: false });
Content.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false });
Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false });

function connect() {
  if (process.env.ENABLE_SYNC) {
    return database.sync({ alter: true })
        .catch(error => {
          console.error('Failed syncing database: ', error);
          throw error;
        });
  }
  return Promise.resolve();
}

function getProvider(provider) {
  return Provider.findOrCreate({ where: { name: { [Op.eq]: provider.name } }, defaults: provider })
      .then((result) => result[0])
      .catch(() => provider);
}

function getTorrent(torrent) {
  const where = torrent.infoHash
      ? { infoHash: torrent.infoHash }
      : { provider: torrent.provider, torrentId: torrent.torrentId }
  return Torrent.findOne({ where: where });
}

function getTorrentsBasedOnTitle(titleQuery, type) {
  return getTorrentsBasedOnQuery({ title: { [Op.regexp]: `${titleQuery}` }, type: type });
}

function getTorrentsBasedOnQuery(where) {
  return Torrent.findAll({ where: where });
}

function getTorrentsWithoutSize() {
  return Torrent.findAll({
    where: literal(
        'exists (select 1 from files where files."infoHash" = torrent."infoHash" and files.size = 300000000)'),
    order: [
      ['seeders', 'DESC']
    ]
  });
}

function getUpdateSeedersTorrents(limit = 50) {
  const until = moment().subtract(7, 'days').format('YYYY-MM-DD');
  return Torrent.findAll({
    where: literal(`torrent."updatedAt" < '${until}'`),
    limit: limit,
    order: [
      ['seeders', 'DESC'],
      ['updatedAt', 'ASC']
    ]
  });
}

function getUpdateSeedersNewTorrents(limit = 50) {
  const lastUpdate = moment().subtract(12, 'hours').format('YYYY-MM-DD');
  const createdAfter = moment().subtract(4, 'days').format('YYYY-MM-DD');
  return Torrent.findAll({
    where: literal(`torrent."updatedAt" < '${lastUpdate}' AND torrent."createdAt" > '${createdAfter}'`),
    limit: limit,
    order: [
      ['seeders', 'ASC'],
      ['updatedAt', 'ASC']
    ]
  });
}

function getNoContentsTorrents() {
  return Torrent.findAll({
    where: { opened: false, seeders: { [Op.gte]: 1 } },
    limit: 500,
    order: [[fn('RANDOM')]]
  });
}

function createTorrent(torrent) {
  return Torrent.upsert(torrent)
      .then(() => createContents(torrent.infoHash, torrent.contents))
      .then(() => createSubtitles(torrent.infoHash, torrent.subtitles));
}

function setTorrentSeeders(torrent, seeders) {
  const where = torrent.infoHash
      ? { infoHash: torrent.infoHash }
      : { provider: torrent.provider, torrentId: torrent.torrentId }
  return Torrent.update(
      { seeders: seeders },
      { where: where }
  );
}

function deleteTorrent(torrent) {
  return Torrent.destroy({ where: { infoHash: torrent.infoHash } })
}

function createFile(file) {
  if (file.id) {
    return (file.dataValues ? file.save() : File.upsert(file))
        .then(() => upsertSubtitles(file, file.subtitles));
  }
  if (file.subtitles && file.subtitles.length) {
    file.subtitles = file.subtitles.map(subtitle => ({ infoHash: file.infoHash, title: subtitle.path, ...subtitle }));
  }
  return File.create(file, { include: [Subtitle], ignoreDuplicates: true });
}

function getFiles(torrent) {
  return File.findAll({ where: { infoHash: torrent.infoHash } });
}

function getFilesBasedOnTitle(titleQuery) {
  return File.findAll({ where: { title: { [Op.regexp]: `${titleQuery}` } } });
}

function deleteFile(file) {
  return File.destroy({ where: { id: file.id } })
}

function createSubtitles(infoHash, subtitles) {
  if (subtitles && subtitles.length) {
    return Subtitle.bulkCreate(subtitles.map(subtitle => ({ infoHash, title: subtitle.path, ...subtitle })));
  }
  return Promise.resolve();
}

function upsertSubtitles(file, subtitles) {
  if (file.id && subtitles && subtitles.length) {
    return Promises.sequence(subtitles
        .map(subtitle => {
          subtitle.fileId = file.id;
          subtitle.infoHash = subtitle.infoHash || file.infoHash;
          subtitle.title = subtitle.title || subtitle.path;
          return subtitle;
        })
        .map(subtitle => () => subtitle.dataValues ? subtitle.save() : Subtitle.create(subtitle)));
  }
  return Promise.resolve();
}

function getSubtitles(torrent) {
  return Subtitle.findAll({ where: { infoHash: torrent.infoHash } });
}

function getUnassignedSubtitles() {
  return Subtitle.findAll({ where: { fileId: null } });
}

function createContents(infoHash, contents) {
  if (contents && contents.length) {
    return Content.bulkCreate(contents.map(content => ({ infoHash, ...content })), { ignoreDuplicates: true })
        .then(() => Torrent.update({ opened: true }, { where: { infoHash: infoHash }, silent: true }));
  }
  return Promise.resolve();
}

function getContents(torrent) {
  return Content.findAll({ where: { infoHash: torrent.infoHash } });
}

function getSkipTorrent(torrent) {
  return SkipTorrent.findByPk(torrent.infoHash)
      .then((result) => {
        if (!result) {
          throw new Error(`torrent not found: ${torrent.infoHash}`);
        }
        return result.dataValues;
      })
}

function createSkipTorrent(torrent) {
  return SkipTorrent.upsert({ infoHash: torrent.infoHash });
}

module.exports = {
  connect,
  getProvider,
  createTorrent,
  setTorrentSeeders,
  getTorrent,
  getTorrentsBasedOnTitle,
  getTorrentsBasedOnQuery,
  deleteTorrent,
  getUpdateSeedersTorrents,
  getUpdateSeedersNewTorrents,
  getNoContentsTorrents,
  createFile,
  getFiles,
  getFilesBasedOnTitle,
  deleteFile,
  createSubtitles,
  upsertSubtitles,
  getSubtitles,
  getUnassignedSubtitles,
  createContents,
  getContents,
  getSkipTorrent,
  createSkipTorrent,
  getTorrentsWithoutSize
};
21 scraper/lib/requestHelper.js Normal file
@@ -0,0 +1,21 @@
const UserAgent = require('user-agents');
const userAgent = new UserAgent();

function getRandomUserAgent() {
  return userAgent.random().toString();
}

function defaultOptionsWithProxy() {
  if (process.env.PROXY_HOST && process.env.PROXY_TYPE) {
    return {
      proxy: process.env.PROXY_HOST,
      headers: {
        'user-agent': getRandomUserAgent(),
        'proxy-type': process.env.PROXY_TYPE
      }
    }
  }
  return { userAgent: getRandomUserAgent() };
}

module.exports = { getRandomUserAgent, defaultOptionsWithProxy };
208
scraper/lib/torrent.js
Normal file
208
scraper/lib/torrent.js
Normal file
@@ -0,0 +1,208 @@
|
||||
const torrentStream = require('torrent-stream');
|
||||
const needle = require('needle');
|
||||
const parseTorrent = require('parse-torrent');
|
||||
const BTClient = require('bittorrent-tracker')
|
||||
const async = require('async');
|
||||
const decode = require('magnet-uri');
|
||||
const { Type } = require('./types');
|
||||
const { delay } = require('./promises')
|
||||
const { isVideo, isSubtitle } = require('./extension');
|
||||
const { cacheTrackers } = require('./cache');
|
||||
|
||||
const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_all.txt';
|
||||
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
|
||||
const SEEDS_CHECK_TIMEOUT = 15 * 1000; // 15 secs
|
||||
const ADDITIONAL_TRACKERS = [
|
||||
'http://tracker.trackerfix.com:80/announce',
|
||||
'udp://9.rarbg.me:2780',
|
||||
'udp://9.rarbg.to:2870'
|
||||
];
|
||||
const ANIME_TRACKERS = [
|
||||
"http://nyaa.tracker.wf:7777/announce",
|
||||
"udp://anidex.moe:6969/announce",
|
||||
"udp://tracker-udp.anirena.com:80/announce",
|
||||
"udp://tracker.uw0.xyz:6969/announce"
|
||||
];
|
||||
|
||||
async function updateCurrentSeeders(torrentsInput) {
|
||||
return new Promise(async (resolve) => {
|
||||
const torrents = Array.isArray(torrentsInput) ? torrentsInput : [torrentsInput];
|
||||
const perTorrentResults = Object.fromEntries(new Map(torrents.map(torrent => [torrent.infoHash, {}])));
|
||||
const perTrackerInfoHashes = await Promise.all(torrents.map(torrent => getTorrentTrackers(torrent)
|
||||
.then(torrentTrackers => ({ infoHash: torrent.infoHash, trackers: torrentTrackers }))))
|
||||
.then(allTorrentTrackers => allTorrentTrackers
|
||||
.reduce((allTrackersMap, torrentTrackers) => {
|
||||
torrentTrackers.trackers.forEach(tracker =>
|
||||
allTrackersMap[tracker] = (allTrackersMap[tracker] || []).concat(torrentTrackers.infoHash));
|
||||
return allTrackersMap;
|
||||
}, {}));
|
||||
let successCounter = 0;
|
||||
const callback = () => {
|
||||
console.log(`Total successful tracker responses: ${successCounter}`)
|
||||
resolve(perTorrentResults);
|
||||
}
|
||||
setTimeout(callback, SEEDS_CHECK_TIMEOUT);
|
||||
|
||||
async.each(Object.keys(perTrackerInfoHashes), function (tracker, ready) {
|
||||
BTClient.scrape({ infoHash: perTrackerInfoHashes[tracker], announce: tracker }, (error, response) => {
|
||||
if (response) {
|
||||
const results = Array.isArray(torrentsInput) ? Object.entries(response) : [[response.infoHash, response]];
|
||||
results
|
||||
.filter(([infoHash]) => perTorrentResults[infoHash])
|
||||
.forEach(([infoHash, seeders]) =>
|
||||
perTorrentResults[infoHash][tracker] = [seeders.complete, seeders.incomplete])
|
||||
successCounter++;
|
||||
} else if (error) {
|
||||
perTrackerInfoHashes[tracker]
|
||||
.filter(infoHash => perTorrentResults[infoHash])
|
||||
.forEach(infoHash => perTorrentResults[infoHash][tracker] = [0, 0, error.message])
|
||||
}
|
||||
ready();
|
||||
})
|
||||
}, callback);
|
||||
}).then(perTorrentResults => {
|
||||
const torrents = Array.isArray(torrentsInput) ? torrentsInput : [torrentsInput];
|
||||
torrents.forEach(torrent => {
|
||||
const results = perTorrentResults[torrent.infoHash];
|
||||
const newSeeders = Math.max(...Object.values(results).map(values => values[0]).concat(0));
|
||||
if (torrent.seeders !== newSeeders) {
|
||||
console.log(`Updating seeders for [${torrent.infoHash}] ${torrent.title} - ${torrent.seeders} -> ${newSeeders}`)
|
||||
torrent.seeders = newSeeders;
|
||||
}
|
||||
})
|
||||
return torrentsInput;
|
||||
});
|
||||
}
|
||||
|
||||
async function updateTorrentSize(torrent) {
|
||||
return filesAndSizeFromTorrentStream(torrent, SEEDS_CHECK_TIMEOUT)
|
||||
.then(result => {
|
||||
torrent.size = result.size;
|
||||
torrent.files = result.files;
|
||||
return torrent;
|
||||
});
|
||||
}
|
||||
|
||||
async function sizeAndFiles(torrent) {
|
||||
return filesAndSizeFromTorrentStream(torrent, 30000);
|
||||
}
|
||||
|
||||
async function torrentFiles(torrent, timeout) {
|
||||
return getFilesFromObject(torrent)
|
||||
.catch(() => filesFromTorrentFile(torrent))
|
||||
.catch(() => filesFromTorrentStream(torrent, timeout))
|
||||
.then(files => ({
|
||||
contents: files,
|
||||
videos: filterVideos(files),
|
||||
subtitles: filterSubtitles(files)
|
||||
}));
|
||||
}
|
||||
|
||||
function getFilesFromObject(torrent) {
|
||||
if (Array.isArray(torrent.files)) {
|
||||
return Promise.resolve(torrent.files);
|
||||
}
|
||||
if (typeof torrent.files === 'function') {
|
||||
return torrent.files();
|
||||
}
|
||||
return Promise.reject("No files in the object");
|
||||
}
|
||||
|
||||
async function filesFromTorrentFile(torrent) {
|
||||
if (!torrent.torrentLink) {
|
||||
return Promise.reject(new Error("no torrentLink"));
|
||||
}
|
||||
|
||||
return needle('get', torrent.torrentLink, { open_timeout: 10000 })
|
||||
.then((response) => {
|
||||
if (!response.body || response.statusCode !== 200) {
|
||||
throw new Error('torrent not found')
|
||||
}
|
||||
return response.body
|
||||
})
|
||||
.then((body) => parseTorrent(body))
|
||||
.then((info) => info.files.map((file, fileId) => ({
|
||||
fileIndex: fileId,
|
||||
name: file.name,
|
||||
path: file.path.replace(/^[^\/]+\//, ''),
|
||||
size: file.length
|
||||
})));
|
||||
}
|
||||
|
||||
async function filesFromTorrentStream(torrent, timeout) {
|
||||
return filesAndSizeFromTorrentStream(torrent, timeout).then(result => result.files);
|
||||
}
|
||||
|
||||
function filesAndSizeFromTorrentStream(torrent, timeout = 30000) {
|
||||
if (!torrent.infoHash && !torrent.magnetLink) {
|
||||
return Promise.reject(new Error("no infoHash or magnetLink"));
|
||||
}
|
||||
const magnet = torrent.magnetLink || decode.encode({ infoHash: torrent.infoHash, announce: torrent.trackers });
|
||||
return new Promise((resolve, rejected) => {
|
||||
const engine = new torrentStream(magnet, { connections: MAX_PEER_CONNECTIONS });
|
||||
|
||||
engine.ready(() => {
|
||||
const files = engine.files
|
||||
.map((file, fileId) => ({
|
||||
fileIndex: fileId,
|
||||
name: file.name,
|
||||
path: file.path.replace(/^[^\/]+\//, ''),
|
||||
size: file.length
|
||||
}));
|
||||
const size = engine.torrent.length;
|
||||
|
||||
engine.destroy();
|
||||
resolve({ files, size });
|
||||
});
|
||||
setTimeout(() => {
|
||||
engine.destroy();
|
||||
rejected(new Error('No available connections for torrent!'));
|
||||
}, timeout);
|
||||
});
|
||||
}
|
||||
|
||||
function filterVideos(files) {
|
||||
if (files.length === 1 && !Number.isInteger(files[0].fileIndex)) {
|
||||
return files;
|
||||
}
|
||||
const videos = files.filter(file => isVideo(file.path));
|
||||
const maxSize = Math.max(...videos.map(video => video.size));
|
||||
const minSampleRatio = videos.length <= 3 ? 3 : 10;
|
||||
const minAnimeExtraRatio = 5;
|
||||
const minRedundantRatio = videos.length <= 3 ? 30 : Number.MAX_VALUE;
|
||||
const isSample = video => video.path.match(/sample|bonus|promo/i) && maxSize / parseInt(video.size) > minSampleRatio;
|
||||
const isRedundant = video => maxSize / parseInt(video.size) > minRedundantRatio;
|
||||
const isExtra = video => video.path.match(/extras?\//i);
|
||||
const isAnimeExtra = video => video.path.match(/(?:\b|_)(?:NC)?(?:ED|OP|PV)(?:v?\d\d?)?(?:\b|_)/i)
|
||||
&& maxSize / parseInt(video.size) > minAnimeExtraRatio;
|
||||
const isWatermark = video => video.path.match(/^[A-Z-]+(?:\.[A-Z]+)?\.\w{3,4}$/)
|
||||
&& maxSize / parseInt(video.size) > minAnimeExtraRatio
|
||||
return videos
|
||||
.filter(video => !isSample(video))
|
||||
.filter(video => !isExtra(video))
|
||||
.filter(video => !isAnimeExtra(video))
|
||||
.filter(video => !isRedundant(video))
|
||||
.filter(video => !isWatermark(video));
|
||||
}
|
||||
|
||||
function filterSubtitles(files) {
|
||||
return files.filter(file => isSubtitle(file.path));
|
||||
}
|
||||
|
||||
async function getTorrentTrackers(torrent) {
  const magnetTrackers = torrent.magnetLink && decode(torrent.magnetLink).tr || [];
  const torrentTrackers = torrent.trackers && torrent.trackers.split(',') || [];
  const defaultTrackers = await getDefaultTrackers(torrent);
  return Array.from(new Set([].concat(magnetTrackers).concat(torrentTrackers).concat(defaultTrackers)));
}

async function getDefaultTrackers(torrent, retry = 3) {
  return cacheTrackers(() => needle('get', TRACKERS_URL, { open_timeout: SEEDS_CHECK_TIMEOUT })
      .then(response => response.body && response.body.trim())
      .then(body => body && body.split('\n\n') || []))
      .catch(() => retry > 0 ? delay(5000).then(() => getDefaultTrackers(torrent, retry - 1)) : [])
      .then(trackers => trackers.concat(ADDITIONAL_TRACKERS))
      .then(trackers => torrent.type === Type.ANIME ? trackers.concat(ANIME_TRACKERS) : trackers);
}

module.exports = { updateCurrentSeeders, updateTorrentSize, sizeAndFiles, torrentFiles };
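
// Usage sketch for the stream helper above (illustrative only — the infoHash is a placeholder, not a real torrent):
// filesFromTorrentStream({ infoHash: '0123456789abcdef0123456789abcdef01234567' }, 10000)
//     .then(files => files.forEach(file => console.log(file.fileIndex, file.path, file.size)))
//     .catch(error => console.log('could not fetch torrent metadata in time:', error.message));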
182 scraper/lib/torrentEntries.js Normal file
@@ -0,0 +1,182 @@
const { parse } = require('parse-torrent-title');
const { Type } = require('./types');
const Promises = require('./promises');
const repository = require('./repository');
const { getImdbId, getKitsuId } = require('./metadata');
const { parseTorrentFiles } = require('./torrentFiles');
const { assignSubtitles } = require('./torrentSubtitles');
const { isPackTorrent } = require('./parseHelper');

async function createTorrentEntry(torrent, overwrite = false) {
  const titleInfo = parse(torrent.title);

  if ((titleInfo.seasons || torrent.title.match(/\[\d+-\d+/)) && torrent.type === Type.MOVIE) {
    // sometimes a series torrent might be put into the movies category
    torrent.type = Type.SERIES;
  }
  if (!torrent.imdbId && torrent.type !== Type.ANIME) {
    torrent.imdbId = await getImdbId(titleInfo, torrent.type)
        .catch(() => undefined);
  }
  if (torrent.imdbId && torrent.imdbId.length < 9) {
    // pad zeros to imdbId if missing
    torrent.imdbId = 'tt' + torrent.imdbId.replace('tt', '').padStart(7, '0');
  }
  if (torrent.imdbId && torrent.imdbId.length > 9 && torrent.imdbId.startsWith('tt0')) {
    // sanitize imdbId from redundant zeros
    torrent.imdbId = torrent.imdbId.replace(/tt0+([0-9]{7,})$/, 'tt$1');
  }
  if (!torrent.kitsuId && torrent.type === Type.ANIME) {
    torrent.kitsuId = await getKitsuId(titleInfo)
        .catch(() => undefined);
  }

  if (!torrent.imdbId && !torrent.kitsuId && !isPackTorrent(torrent)) {
    console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
    return;
  }

  const { contents, videos, subtitles } = await parseTorrentFiles(torrent)
      .then(torrentContents => overwrite ? overwriteExistingFiles(torrent, torrentContents) : torrentContents)
      .then(torrentContents => assignSubtitles(torrentContents))
      .catch(error => {
        console.log(`Failed getting files for ${torrent.title}`, error.message);
        return {};
      });
  if (!videos || !videos.length) {
    console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
    return;
  }

  return repository.createTorrent({ ...torrent, contents, subtitles })
      .then(() => Promises.sequence(videos.map(video => () => repository.createFile(video))))
      .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}
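// Worked example of the imdbId normalization above (values illustrative):
//   '123'          -> 'tt' + '123'.padStart(7, '0')           -> 'tt0000123'
//   'tt045871'     -> 'tt' + '045871'.padStart(7, '0')        -> 'tt0045871'
//   'tt0012345678' -> /tt0+([0-9]{7,})$/ strips extra zeros   -> 'tt12345678'
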
async function overwriteExistingFiles(torrent, torrentContents) {
  const videos = torrentContents && torrentContents.videos;
  if (videos && videos.length) {
    const existingFiles = await repository.getFiles({ infoHash: videos[0].infoHash })
        .then((existing) => existing
            .reduce((map, next) => {
              const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
              map[fileIndex] = (map[fileIndex] || []).concat(next);
              return map;
            }, {}))
        .catch(() => undefined);
    if (existingFiles && Object.keys(existingFiles).length) {
      const overwrittenVideos = videos
          .map(file => {
            const mapping = videos.length === 1 && Object.keys(existingFiles).length === 1
                ? Object.values(existingFiles)[0]
                : existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
            if (mapping) {
              const originalFile = mapping.shift();
              return { id: originalFile.id, ...file };
            }
            return file;
          });
      return { ...torrentContents, videos: overwrittenVideos };
    }
    return torrentContents;
  }
  return Promise.reject(new Error(`No video files found for: ${torrent.title}`));
}
async function createSkipTorrentEntry(torrent) {
  return repository.createSkipTorrent(torrent);
}

async function getStoredTorrentEntry(torrent) {
  return repository.getSkipTorrent(torrent)
      .catch(() => repository.getTorrent(torrent))
      .catch(() => undefined);
}
async function checkAndUpdateTorrent(torrent) {
  const storedTorrent = torrent.dataValues
      ? torrent
      : await repository.getTorrent(torrent).catch(() => undefined);
  if (!storedTorrent) {
    return false;
  }
  if (storedTorrent.provider === 'KickassTorrents' && torrent.provider) {
    storedTorrent.provider = torrent.provider;
    storedTorrent.torrentId = torrent.torrentId;
  }
  if (!storedTorrent.languages && torrent.languages && storedTorrent.provider !== 'RARBG') {
    storedTorrent.languages = torrent.languages;
    storedTorrent.save();
    console.log(`Updated [${torrent.infoHash}] ${torrent.title} language to ${torrent.languages}`);
  }
  return createTorrentContents({ ...storedTorrent.get(), torrentLink: torrent.torrentLink })
      .then(() => updateTorrentSeeders(torrent));
}
async function createTorrentContents(torrent) {
  if (torrent.opened) {
    return;
  }
  const storedVideos = await repository.getFiles(torrent).catch(() => []);
  if (!storedVideos || !storedVideos.length) {
    return;
  }
  const notOpenedVideo = storedVideos.length === 1 && !Number.isInteger(storedVideos[0].fileIndex);
  const imdbId = Promises.mostCommonValue(storedVideos.map(stored => stored.imdbId));
  const kitsuId = Promises.mostCommonValue(storedVideos.map(stored => stored.kitsuId));

  const { contents, videos, subtitles } = await parseTorrentFiles({ ...torrent, imdbId, kitsuId })
      .then(torrentContents => notOpenedVideo ? torrentContents : { ...torrentContents, videos: storedVideos })
      .then(torrentContents => assignSubtitles(torrentContents))
      .catch(error => {
        console.log(`Failed getting contents for [${torrent.infoHash}] ${torrent.title}`, error.message);
        return {};
      });

  if (!contents || !contents.length) {
    return;
  }
  if (notOpenedVideo && videos.length === 1) {
    // if both have a single video and the stored one was not opened, update the stored one with the true metadata and use it
    storedVideos[0].fileIndex = videos[0].fileIndex;
    storedVideos[0].title = videos[0].title;
    storedVideos[0].size = videos[0].size;
    storedVideos[0].subtitles = videos[0].subtitles;
    videos[0] = storedVideos[0];
  }
  // no videos available or more than one new video was in the torrent
  const shouldDeleteOld = notOpenedVideo && videos.every(video => !video.id);

  return repository.createTorrent({ ...torrent, contents, subtitles })
      .then(() => {
        if (shouldDeleteOld) {
          console.error(`Deleting old video for [${torrent.infoHash}] ${torrent.title}`);
          return storedVideos[0].destroy();
        }
        return Promise.resolve();
      })
      .then(() => Promises.sequence(videos.map(video => () => repository.createFile(video))))
      .then(() => console.log(`Created contents for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`))
      .catch(error => console.error(`Failed saving contents for [${torrent.infoHash}] ${torrent.title}`, error));
}
async function updateTorrentSeeders(torrent) {
  if (!(torrent.infoHash || (torrent.provider && torrent.torrentId)) || !Number.isInteger(torrent.seeders)) {
    return torrent;
  }

  return repository.setTorrentSeeders(torrent, torrent.seeders)
      .catch(error => {
        console.warn('Failed updating seeders:', error);
        return undefined;
      });
}

module.exports = {
  createTorrentEntry,
  createTorrentContents,
  createSkipTorrentEntry,
  getStoredTorrentEntry,
  updateTorrentSeeders,
  checkAndUpdateTorrent
};
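
// Typical flow (illustrative; `scrapedTorrent` is a placeholder object produced by a scraper):
// getStoredTorrentEntry(scrapedTorrent)
//     .then(stored => stored
//         ? checkAndUpdateTorrent(scrapedTorrent)
//         : createTorrentEntry(scrapedTorrent));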
506 scraper/lib/torrentFiles.js Normal file
@@ -0,0 +1,506 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const distance = require('jaro-winkler');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
const { torrentFiles } = require('../lib/torrent');
const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata');
const { parseSeriesVideos, isPackTorrent } = require('../lib/parseHelper');
const { Type } = require('./types');
const { isDisk } = require('./extension');

const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });

async function parseTorrentFiles(torrent) {
  const parsedTorrentName = parse(torrent.title);
  const metadata = await getMetadata(torrent.kitsuId || torrent.imdbId, torrent.type || Type.MOVIE)
      .then(meta => Object.assign({}, meta))
      .catch(() => undefined);

  // if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
  //   throw new Error(`Mismatching entry type for ${torrent.name}: ${torrent.type}!=${metadata.type}`);
  // }
  if (torrent.type !== Type.ANIME && metadata && metadata.type && metadata.type !== torrent.type) {
    // it's actually a movie/series
    torrent.type = metadata.type;
  }

  if (torrent.type === Type.MOVIE && (!parsedTorrentName.seasons ||
      parsedTorrentName.season === 5 && [1, 5].includes(parsedTorrentName.episode))) {
    return parseMovieFiles(torrent, parsedTorrentName, metadata);
  }

  return parseSeriesFiles(torrent, parsedTorrentName, metadata);
}
async function parseMovieFiles(torrent, parsedName, metadata) {
  const { contents, videos, subtitles } = await getMoviesTorrentContent(torrent);
  const filteredVideos = videos
      .filter(video => video.size > MIN_SIZE)
      .filter(video => !isFeaturette(video));
  if (isSingleMovie(filteredVideos)) {
    const parsedVideos = filteredVideos.map(video => ({
      infoHash: torrent.infoHash,
      fileIndex: video.fileIndex,
      title: video.path || torrent.title,
      size: video.size || torrent.size,
      imdbId: torrent.imdbId || metadata && metadata.imdbId,
      kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
    }));
    return { contents, videos: parsedVideos, subtitles };
  }

  const parsedVideos = await Promises.sequence(filteredVideos.map(video => () => isFeaturette(video)
          ? Promise.resolve(video)
          : findMovieImdbId(video.name).then(imdbId => ({ ...video, imdbId }))))
      .then(videos => videos.map(video => ({
        infoHash: torrent.infoHash,
        fileIndex: video.fileIndex,
        title: video.path || video.name,
        size: video.size,
        imdbId: video.imdbId,
      })));
  return { contents, videos: parsedVideos, subtitles };
}
async function parseSeriesFiles(torrent, parsedName, metadata) {
  const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent);
  const parsedVideos = await Promise.resolve(videos)
      .then(videos => videos.filter(video => videos.length === 1 || video.size > MIN_SIZE))
      .then(videos => parseSeriesVideos(torrent, videos))
      .then(videos => decomposeEpisodes(torrent, videos, metadata))
      .then(videos => assignKitsuOrImdbEpisodes(torrent, videos, metadata))
      .then(videos => Promise.all(videos.map(video => video.isMovie
          ? mapSeriesMovie(video, torrent)
          : mapSeriesEpisode(video, torrent, videos))))
      .then(videos => videos
          .reduce((a, b) => a.concat(b), [])
          .map(video => isFeaturette(video) ? clearInfoFields(video) : video));
  return { contents, videos: parsedVideos, subtitles };
}
async function getMoviesTorrentContent(torrent) {
  const files = await torrentFiles(torrent)
      .catch(error => {
        if (!isPackTorrent(torrent)) {
          return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] };
        }
        return Promise.reject(error);
      });
  if (files.contents && files.contents.length && !files.videos.length && isDiskTorrent(files.contents)) {
    files.videos = [{ name: torrent.title, path: torrent.title, size: torrent.size }];
  }
  return files;
}

async function getSeriesTorrentContent(torrent) {
  return torrentFiles(torrent)
      .catch(error => {
        if (!isPackTorrent(torrent)) {
          return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] };
        }
        return Promise.reject(error);
      });
}
async function mapSeriesEpisode(file, torrent, files) {
  if (!file.episodes && !file.kitsuEpisodes) {
    if (files.some(otherFile => otherFile.episodes || otherFile.kitsuEpisodes) || parse(torrent.title).seasons) {
      return Promise.resolve({
        infoHash: torrent.infoHash,
        fileIndex: file.fileIndex,
        title: file.path || file.name,
        size: file.size,
        imdbId: torrent.imdbId || file.imdbId,
      });
    }
    return Promise.resolve([]);
  }
  const episodeIndexes = [...(file.episodes || file.kitsuEpisodes).keys()];
  return Promise.resolve(episodeIndexes.map((index) => ({
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: file.imdbId || torrent.imdbId,
    imdbSeason: file.season,
    imdbEpisode: file.episodes && file.episodes[index],
    kitsuId: file.kitsuId || torrent.kitsuId,
    kitsuEpisode: file.kitsuEpisodes && file.kitsuEpisodes[index]
  })));
}
async function mapSeriesMovie(file, torrent) {
  const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined;
  const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined;
  const metadata = await getMetadata(kitsuId || imdbId, Type.MOVIE).catch(() => ({}));
  const hasEpisode = metadata.videos && metadata.videos.length && (file.episode || metadata.videos.length === 1);
  const episodeVideo = hasEpisode && metadata.videos[(file.episode || 1) - 1];
  return [{
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: metadata.imdbId || imdbId,
    kitsuId: metadata.kitsuId || kitsuId,
    imdbSeason: episodeVideo && metadata.imdbId ? episodeVideo.imdbSeason : undefined,
    imdbEpisode: episodeVideo && metadata.imdbId ? episodeVideo.imdbEpisode || episodeVideo.episode : undefined,
    kitsuEpisode: episodeVideo && metadata.kitsuId ? episodeVideo.kitsuEpisode || episodeVideo.episode : undefined
  }];
}
async function decomposeEpisodes(torrent, files, metadata = { episodeCount: [] }) {
  if (files.every(file => !file.episodes && !file.date)) {
    return files;
  }

  preprocessEpisodes(files);

  if (torrent.type === Type.ANIME && torrent.kitsuId) {
    if (needsCinemetaMetadataForAnime(files, metadata)) {
      // In some cases anime could be resolved to a wrong kitsuId
      // because of imdb season naming/absolute per-series naming/multiple seasons,
      // so in these cases we need to fetch cinemeta based metadata and decompose episodes using that
      await updateToCinemetaMetadata(metadata);
      if (files.some(file => Number.isInteger(file.season))) {
        // sometimes multi season anime torrents don't include season 1 naming
        files
            .filter(file => !Number.isInteger(file.season) && file.episodes)
            .forEach(file => file.season = 1);
      }
    } else {
      // otherwise for anime type episodes are always absolute and for a single season
      files
          .filter(file => file.episodes && file.season !== 0)
          .forEach(file => file.season = 1);
      return files;
    }
  }

  const sortedEpisodes = files
      .map(file => !file.isMovie && file.episodes || [])
      .reduce((a, b) => a.concat(b), [])
      .sort((a, b) => a - b);

  if (isConcatSeasonAndEpisodeFiles(files, sortedEpisodes, metadata)) {
    decomposeConcatSeasonAndEpisodeFiles(torrent, files, metadata);
  } else if (isDateEpisodeFiles(files, metadata)) {
    decomposeDateEpisodeFiles(torrent, files, metadata);
  } else if (isAbsoluteEpisodeFiles(files, metadata)) {
    decomposeAbsoluteEpisodeFiles(torrent, files, metadata);
  }
  // decomposeEpisodeTitleFiles(torrent, files, metadata);

  return files;
}
function preprocessEpisodes(files) {
  // reverse special episode naming when it is named with a 0 episode, i.e. S02E00
  files
      .filter(file => Number.isInteger(file.season) && file.episode === 0)
      .forEach(file => {
        file.episode = file.season;
        file.episodes = [file.season];
        file.season = 0;
      });
}
function isConcatSeasonAndEpisodeFiles(files, sortedEpisodes, metadata) {
  if (metadata.kitsuId !== undefined) {
    // anime does not use this naming scheme in 99% of cases
    return false;
  }
  // decompose concat season and episode files (ex. 101=S01E01) in case:
  // 1. a file has a season, but individual files are concatenated with that season
  //    (ex. path Season 5/511 - Prize Fighters.avi)
  // 2. a file does not have a season and the episode does not go out of range
  //    for the concat season episode count
  const thresholdAbove = Math.max(Math.ceil(files.length * 0.05), 5);
  const thresholdSorted = Math.max(Math.ceil(files.length * 0.8), 8);
  const threshold = Math.max(Math.ceil(files.length * 0.8), 5);
  const sortedConcatEpisodes = sortedEpisodes
      .filter(ep => ep > 100)
      .filter(ep => metadata.episodeCount[div100(ep) - 1] < ep)
      .filter(ep => metadata.episodeCount[div100(ep) - 1] >= mod100(ep));
  const concatFileEpisodes = files
      .filter(file => !file.isMovie && file.episodes)
      .filter(file => !file.season || file.episodes.every(ep => div100(ep) === file.season));
  const concatAboveTotalEpisodeCount = files
      .filter(file => !file.isMovie && file.episodes && file.episodes.every(ep => ep > 100))
      .filter(file => file.episodes.every(ep => ep > metadata.totalCount));
  return sortedConcatEpisodes.length >= thresholdSorted && concatFileEpisodes.length >= threshold
      || concatAboveTotalEpisodeCount.length >= thresholdAbove;
}
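
// Worked example of the concat naming above (values illustrative):
// with metadata.episodeCount = [22, 24, 23], an episode numbered 213 qualifies because
// div100(213) = 2 and mod100(213) = 13, and episodeCount[1] = 24 satisfies 24 < 213 and 24 >= 13,
// so it later decomposes into season 2 episode 13. A "511" with no season 5 in metadata does not qualify.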
function isDateEpisodeFiles(files, metadata) {
  return files.every(file => (!file.season || !metadata.episodeCount[file.season - 1]) && file.date);
}

function isAbsoluteEpisodeFiles(files, metadata) {
  const threshold = Math.ceil(files.length / 5);
  const nonMovieEpisodes = files
      .filter(file => !file.isMovie && file.episodes);
  const absoluteEpisodes = files
      .filter(file => file.season && file.episodes)
      .filter(file => file.episodes.every(ep => metadata.episodeCount[file.season - 1] < ep));
  return nonMovieEpisodes.every(file => !file.season || file.season > metadata.episodeCount.length)
      || absoluteEpisodes.length >= threshold;
  // && !isNewEpisodesNotInMetadata(files, metadata);
}

function isNewEpisodesNotInMetadata(files, metadata) {
  // a new episode might not have been indexed by cinemeta yet;
  // detect this if the episode number is larger than the last episode or the season is larger than the last one
  return files.length === 1
      && /continuing|current/i.test(metadata.status)
      && files.filter(file => !file.isMovie && file.episodes)
          .every(file => file.season >= metadata.episodeCount.length
              && file.episodes.every(ep => ep > metadata.episodeCount[file.season - 1]));
}
function decomposeConcatSeasonAndEpisodeFiles(torrent, files, metadata) {
  files
      .filter(file => file.episodes && file.season !== 0 && file.episodes.every(ep => ep > 100))
      .filter(file => metadata.episodeCount[(file.season || div100(file.episodes[0])) - 1] < 100)
      .filter(file => file.season && file.episodes.every(ep => div100(ep) === file.season) || !file.season)
      .forEach(file => {
        file.season = div100(file.episodes[0]);
        file.episodes = file.episodes.map(ep => mod100(ep));
      });
}

function decomposeAbsoluteEpisodeFiles(torrent, files, metadata) {
  if (metadata.episodeCount.length === 0) {
    files
        .filter(file => !Number.isInteger(file.season) && file.episodes && !file.isMovie)
        .forEach(file => {
          file.season = 1;
        });
    return;
  }
  files
      .filter(file => file.episodes && !file.isMovie && file.season !== 0)
      .filter(file => !file.season || (metadata.episodeCount[file.season - 1] || 0) < file.episodes[0])
      .forEach(file => {
        const seasonIdx = ([...metadata.episodeCount.keys()]
            .find((i) => metadata.episodeCount.slice(0, i + 1).reduce((a, b) => a + b) >= file.episodes[0])
            + 1 || metadata.episodeCount.length) - 1;

        file.season = seasonIdx + 1;
        file.episodes = file.episodes
            .map(ep => ep - metadata.episodeCount.slice(0, seasonIdx).reduce((a, b) => a + b, 0));
      });
}
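
// Worked example of the absolute decomposition above (values illustrative):
// with metadata.episodeCount = [10, 12, 13], an absolute episode 15 finds seasonIdx 1
// (10 < 15 <= 10 + 12), so it becomes season 2 episode 5 (15 - 10).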
function decomposeDateEpisodeFiles(torrent, files, metadata) {
  if (!metadata || !metadata.videos || !metadata.videos.length) {
    return;
  }

  const timeZoneOffset = getTimeZoneOffset(metadata.country);
  const offsetVideos = metadata.videos
      .reduce((map, video) => {
        const releaseDate = moment(video.released).utcOffset(timeZoneOffset).format('YYYY-MM-DD');
        map[releaseDate] = video;
        return map;
      }, {});

  files
      .filter(file => file.date)
      .forEach(file => {
        const video = offsetVideos[file.date];
        if (video) {
          file.season = video.season;
          file.episodes = [video.episode];
        }
      });
}
function decomposeEpisodeTitleFiles(torrent, files, metadata) {
  files
      // .filter(file => !file.season)
      .forEach(file => {
        const episodeTitle = file.name.replace('_', ' ')
            .replace(/^.*(?:E\d+[abc]?|- )\s?(.+)\.\w{1,4}$/, '$1')
            .trim();
        const foundEpisode = metadata.videos
            .map(video => ({ ...video, distance: distance(episodeTitle, video.name) }))
            .sort((a, b) => b.distance - a.distance)[0];
        if (foundEpisode) {
          file.isMovie = false;
          file.season = foundEpisode.season;
          file.episodes = [foundEpisode.episode];
        }
      });
}
function getTimeZoneOffset(country) {
  switch (country) {
    case 'United States':
    case 'USA':
      return '-08:00';
    default:
      return '00:00';
  }
}
function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
  if (!metadata || !metadata.videos || !metadata.videos.length) {
    if (torrent.type === Type.ANIME) {
      // assign episodes as kitsu episodes for anime when no metadata is available for imdb mapping
      files
          .filter(file => file.season && file.episodes)
          .forEach(file => {
            file.kitsuEpisodes = file.episodes;
            file.season = undefined;
            file.episodes = undefined;
          });
      if (metadata && metadata.type === Type.MOVIE && files.every(file => !file.imdbId)) {
        // sometimes a movie has episode naming, thus it is not recognized as a movie and imdbId is not assigned
        files.forEach(file => file.imdbId = metadata.imdbId);
      }
    }
    return files;
  }

  const seriesMapping = metadata.videos
      .reduce((map, video) => {
        const episodeMap = map[video.season] || {};
        episodeMap[video.episode] = video;
        map[video.season] = episodeMap;
        return map;
      }, {});

  if (metadata.videos.some(video => Number.isInteger(video.imdbSeason)) || !metadata.imdbId) {
    // kitsu episode info is the base
    files
        .filter(file => Number.isInteger(file.season) && file.episodes)
        .forEach(file => {
          const seasonMapping = seriesMapping[file.season];
          const episodeMapping = seasonMapping && seasonMapping[file.episodes[0]];
          file.kitsuEpisodes = file.episodes;
          if (episodeMapping && Number.isInteger(episodeMapping.imdbSeason)) {
            file.imdbId = metadata.imdbId;
            file.season = episodeMapping.imdbSeason;
            file.episodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].imdbEpisode);
          } else {
            // no imdb mapping available for the episode
            file.season = undefined;
            file.episodes = undefined;
          }
        });
  } else if (metadata.videos.some(video => video.kitsuEpisode)) {
    // imdb episode info is the base
    files
        .filter(file => Number.isInteger(file.season) && file.episodes)
        .forEach(file => {
          if (seriesMapping[file.season]) {
            const seasonMapping = seriesMapping[file.season];
            file.imdbId = metadata.imdbId;
            file.kitsuId = seasonMapping[file.episodes[0]] && seasonMapping[file.episodes[0]].kitsuId;
            file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
          } else if (seriesMapping[file.season - 1]) {
            // sometimes a second season might be a continuation of the previous season
            const seasonMapping = seriesMapping[file.season - 1];
            const episodes = Object.values(seasonMapping);
            const firstKitsuId = episodes.length && episodes[0].kitsuId;
            const differentTitlesCount = new Set(episodes.map(ep => ep.kitsuId)).size;
            const skippedCount = episodes.filter(ep => ep.kitsuId === firstKitsuId).length;
            const seasonEpisodes = files
                .filter(otherFile => otherFile.season === file.season)
                .reduce((a, b) => a.concat(b.episodes), []);
            const isAbsoluteOrder = seasonEpisodes.every(ep => ep > skippedCount && ep <= episodes.length);
            const isNormalOrder = seasonEpisodes.every(ep => ep + skippedCount <= episodes.length);
            if (differentTitlesCount >= 1 && (isAbsoluteOrder || isNormalOrder)) {
              file.imdbId = metadata.imdbId;
              file.season = file.season - 1;
              file.episodes = file.episodes.map(ep => isAbsoluteOrder ? ep : ep + skippedCount);
              file.kitsuId = seasonMapping[file.episodes[0]].kitsuId;
              file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
            }
          } else if (Object.values(seriesMapping).length === 1 && seriesMapping[1]) {
            // sometimes a series might be named with a sequel season, but it's not a season on imdb and is a new title
            const seasonMapping = seriesMapping[1];
            file.imdbId = metadata.imdbId;
            file.season = 1;
            file.kitsuId = seasonMapping[file.episodes[0]].kitsuId;
            file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
          }
        });
  }
  return files;
}
function needsCinemetaMetadataForAnime(files, metadata) {
  if (!metadata || !metadata.imdbId || !metadata.videos || !metadata.videos.length) {
    return false;
  }

  const minSeason = Math.min(...metadata.videos.map(video => video.imdbSeason)) || Number.MAX_VALUE;
  const maxSeason = Math.max(...metadata.videos.map(video => video.imdbSeason)) || Number.MAX_VALUE;
  const differentSeasons = new Set(metadata.videos
      .map(video => video.imdbSeason)
      .filter(season => Number.isInteger(season))).size;
  const total = metadata.totalCount || Number.MAX_VALUE;
  return differentSeasons > 1 || files
      .filter(file => !file.isMovie && file.episodes)
      .some(file => file.season < minSeason || file.season > maxSeason || file.episodes.every(ep => ep > total));
}
async function updateToCinemetaMetadata(metadata) {
  return getMetadata(metadata.imdbId, metadata.type)
      .then(newMetadata => !newMetadata.videos || !newMetadata.videos.length ? metadata : newMetadata)
      .then(newMetadata => {
        metadata.videos = newMetadata.videos;
        metadata.episodeCount = newMetadata.episodeCount;
        metadata.totalCount = newMetadata.totalCount;
        return metadata;
      })
      .catch(error => console.warn(`Failed ${metadata.imdbId} metadata cinemeta update due to: ${error.message}`));
}
function findMovieImdbId(title) {
  const parsedTitle = typeof title === 'string' ? parse(title) : title;
  return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
}

function findMovieKitsuId(title) {
  const parsedTitle = typeof title === 'string' ? parse(title) : title;
  return getKitsuId(parsedTitle, Type.MOVIE).catch(() => undefined);
}
function isDiskTorrent(contents) {
  return contents.some(content => isDisk(content.path));
}

function isSingleMovie(videos) {
  return videos.length === 1 ||
      (videos.length === 2 &&
          videos.find(v => /\b(?:part|disc|cd)[ ._-]?0?1\b|^0?1\.\w{2,4}$/i.test(v.path)) &&
          videos.find(v => /\b(?:part|disc|cd)[ ._-]?0?2\b|^0?2\.\w{2,4}$/i.test(v.path)));
}

function isFeaturette(video) {
  return /featurettes?\/|extras-grym/i.test(video.path);
}

function clearInfoFields(video) {
  video.imdbId = undefined;
  video.imdbSeason = undefined;
  video.imdbEpisode = undefined;
  video.kitsuId = undefined;
  video.kitsuEpisode = undefined;
  return video;
}

function div100(episode) {
  return (episode / 100 >> 0); // floor to the nearest integer
}

function mod100(episode) {
  return episode % 100;
}

module.exports = { parseTorrentFiles };
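
// Usage sketch (illustrative; the torrent object is a placeholder, not real data):
// parseTorrentFiles({ infoHash: 'abc...', title: 'Series.S01.1080p', type: 'series', size: 4200000000 })
//     .then(({ contents, videos, subtitles }) => console.log(videos.length, 'videos found'));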
91 scraper/lib/torrentSubtitles.js Normal file
@@ -0,0 +1,91 @@
const { parse } = require('parse-torrent-title');

function assignSubtitles({ contents, videos, subtitles }) {
  if (videos && videos.length && subtitles && subtitles.length) {
    if (videos.length === 1) {
      videos[0].subtitles = subtitles;
      return { contents, videos, subtitles: [] };
    }

    const parsedVideos = videos
        .map(video => _parseVideo(video));
    const assignedSubs = subtitles
        .map(subtitle => ({ subtitle, videos: _mostProbableSubtitleVideos(subtitle, parsedVideos) }));
    const unassignedSubs = assignedSubs
        .filter(assignedSub => !assignedSub.videos)
        .map(assignedSub => assignedSub.subtitle);

    assignedSubs
        .filter(assignedSub => assignedSub.videos)
        .forEach(assignedSub => assignedSub.videos
            .forEach(video => video.subtitles = (video.subtitles || []).concat(assignedSub.subtitle)));
    return { contents, videos, subtitles: unassignedSubs };
  }
  return { contents, videos, subtitles };
}
function _parseVideo(video) {
  const fileName = video.title.split('/').pop().replace(/\.(\w{2,4})$/, '');
  const folderName = video.title.replace(/\/?[^/]+$/, '');
  return {
    videoFile: video,
    fileName: fileName,
    folderName: folderName,
    ...parseFilename(video.title)
  };
}

function _mostProbableSubtitleVideos(subtitle, parsedVideos) {
  const subTitle = (subtitle.title || subtitle.path).split('/').pop().replace(/\.(\w{2,4})$/, '');
  const parsedSub = parsePath(subtitle.title || subtitle.path);
  const byFileName = parsedVideos.filter(video => subTitle.includes(video.fileName));
  if (byFileName.length === 1) {
    return byFileName.map(v => v.videoFile);
  }
  const byTitleSeasonEpisode = parsedVideos.filter(video => video.title === parsedSub.title
      && arrayEquals(video.seasons, parsedSub.seasons)
      && arrayEquals(video.episodes, parsedSub.episodes));
  if (singleVideoFile(byTitleSeasonEpisode)) {
    return byTitleSeasonEpisode.map(v => v.videoFile);
  }
  const bySeasonEpisode = parsedVideos.filter(video => arrayEquals(video.seasons, parsedSub.seasons)
      && arrayEquals(video.episodes, parsedSub.episodes));
  if (singleVideoFile(bySeasonEpisode)) {
    return bySeasonEpisode.map(v => v.videoFile);
  }
  const byTitle = parsedVideos.filter(video => video.title && video.title === parsedSub.title);
  if (singleVideoFile(byTitle)) {
    return byTitle.map(v => v.videoFile);
  }
  const byEpisode = parsedVideos.filter(video => arrayEquals(video.episodes, parsedSub.episodes));
  if (singleVideoFile(byEpisode)) {
    return byEpisode.map(v => v.videoFile);
  }
  return undefined;
}
function singleVideoFile(videos) {
  return new Set(videos.map(v => v.videoFile.fileIndex)).size === 1;
}

function parsePath(path) {
  const pathParts = path.split('/').map(part => parseFilename(part));
  const parsedWithEpisode = pathParts.find(parsed => parsed.season && parsed.episodes);
  return parsedWithEpisode || pathParts[pathParts.length - 1];
}

function parseFilename(filename) {
  const parsedInfo = parse(filename);
  const titleEpisode = parsedInfo.title.match(/(\d+)$/);
  if (!parsedInfo.episodes && titleEpisode) {
    parsedInfo.episodes = [parseInt(titleEpisode[1], 10)];
  }
  return parsedInfo;
}

function arrayEquals(array1, array2) {
  if (!array1 || !array2) return array1 === array2;
  return array1.length === array2.length && array1.every((value, index) => value === array2[index]);
}

module.exports = { assignSubtitles };
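
// Usage sketch (illustrative; paths are placeholders):
// const { videos, subtitles } = assignSubtitles({
//   contents: [],
//   videos: [{ title: 'Show/Show.S01E01.mkv', fileIndex: 0 }, { title: 'Show/Show.S01E02.mkv', fileIndex: 1 }],
//   subtitles: [{ path: 'Show/Subs/Show.S01E01.srt' }]
// });
// // the S01E01 subtitle gets attached to the matching video; unmatched subtitles stay in `subtitles`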
5 scraper/lib/types.js Normal file
@@ -0,0 +1,5 @@
exports.Type = {
  MOVIE: 'movie',
  SERIES: 'series',
  ANIME: 'anime'
};
145 scraper/manual/manual.js Normal file
@@ -0,0 +1,145 @@
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
const { mostCommonValue } = require('../lib/promises');
const repository = require('../lib/repository');
const { getImdbId, getKitsuId } = require('../lib/metadata');
const { parseTorrentFiles } = require('../lib/torrentFiles');
const { createTorrentContents } = require('../lib/torrentEntries');
const { assignSubtitles } = require('../lib/torrentSubtitles');
const { Type } = require('../lib/types');

const limiter = new Bottleneck({ maxConcurrent: 40 });

async function updateMovieCollections() {
  const collectionFiles = await repository.getFilesBasedOnTitle('logy')
      .then(files => files.filter(file => file.fileIndex === null))
      .then(files => files.filter(file => parse(file.title).complete));

  return Promise.all(collectionFiles.map(original => repository.getTorrent({ infoHash: original.infoHash })
      .then(torrent => parseTorrentFiles({ ...torrent.get(), imdbId: original.imdbId }))
      .then(files => Promise.all(files.map(file => {
        console.log(file);
        return repository.createFile(file);
      })))
      .then(createdFiles => {
        if (createdFiles && createdFiles.length) {
          console.log(`Updated movie collection ${original.title}`);
          repository.deleteFile(original);
        } else {
          console.log(`Failed updating movie collection ${original.title}`);
        }
      })));
}
async function reapplySeriesSeasonsSavedAsMovies() {
  return repository.getTorrentsBasedOnTitle('(?:[^a-zA-Z0-9]|^)[Ss][012]?[0-9](?:[^0-9]|$)', Type.MOVIE)
      .then(torrents => Promise.all(torrents
          .filter(torrent => parse(torrent.title).seasons)
          .map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, false)
              .then(() => {
                torrent.type = Type.SERIES;
                return torrent.save();
              })))))
      .then(() => console.log('Finished updating multiple torrents'));
}

async function reapplyDecomposingToTorrentsOnRegex(regex) {
  return repository.getTorrentsBasedOnTitle(regex, Type.ANIME)
      .then(torrents => Promise.all(torrents
          .map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, true)))))
      .then(() => console.log('Finished updating multiple torrents'));
}
async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
  const torrent = await repository.getTorrent({ infoHash });
  const storedFiles = await repository.getFiles({ infoHash });
  const fileIndexMap = storedFiles
      .reduce((map, next) => {
        const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
        map[fileIndex] = (map[fileIndex] || []).concat(next);
        return map;
      }, {});
  const files = includeSourceFiles
      ? Object.values(fileIndexMap)
          .map(sameIndexFiles => sameIndexFiles[0])
          .map(file => ({
            fileIndex: file.fileIndex,
            name: file.title.replace(/.*\//, ''),
            path: file.title,
            size: file.size
          }))
      : undefined;
  const kitsuId = undefined;
  const imdbId = kitsuId
      ? undefined
      : mostCommonValue(storedFiles.map(file => file.imdbId))
          || await getImdbId(parse(torrent.title)).catch(() => undefined);

  if (!imdbId && !kitsuId) {
    console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
    return Promise.resolve();
  }

  return parseTorrentFiles({ ...torrent.get(), imdbId, kitsuId, files })
      .then(torrentContents => torrentContents.videos)
      .then(newFiles => newFiles.map(file => {
        const fileIndex = file.fileIndex !== undefined ? file.fileIndex : null;
        const mapping = fileIndexMap[fileIndex];
        if (mapping) {
          const originalFile = mapping.shift();
          if (originalFile) {
            if (!originalFile.imdbId) {
              originalFile.imdbId = file.imdbId;
            }
            originalFile.imdbSeason = file.imdbSeason;
            originalFile.imdbEpisode = file.imdbEpisode;
            originalFile.kitsuId = file.kitsuId;
            originalFile.kitsuEpisode = file.kitsuEpisode;
            return originalFile;
          }
        }
        return file;
      }))
      .then(updatedFiles => Promise.all(updatedFiles
          .map(file => file.id ? file.save() : repository.createFile(file))))
      .then(() => console.log(`Updated files for [${torrent.infoHash}] ${torrent.title}`));
}
async function assignSubs() {
  const unassignedSubs = await repository.getUnassignedSubtitles()
      .then(subs => subs.reduce((map, sub) => {
        map[sub.infoHash] = (map[sub.infoHash] || []).concat(sub);
        return map;
      }, {}));
  const infoHashes = Object.keys(unassignedSubs);

  return Promise.all(infoHashes.map(async infoHash => {
    const videos = await repository.getFiles({ infoHash });
    const subtitles = unassignedSubs[infoHash];
    const assignedContents = assignSubtitles({ videos, subtitles });
    return Promise.all(assignedContents.videos
        .filter(video => video.subtitles)
        .map(video => repository.upsertSubtitles(video, video.subtitles)));
  }));
}
async function openTorrentContents() {
  const limiter = new Bottleneck({ maxConcurrent: 15 });
  const unopenedTorrents = await repository.getNoContentsTorrents();

  return Promise.all(unopenedTorrents.map(torrent => limiter.schedule(() => createTorrentContents(torrent))))
      .then(() => unopenedTorrents.length === 500 ? openTorrentContents() : Promise.resolve());
}

// const infoHashes = [
// ]
// Promises.sequence(infoHashes.map(infoHash => () => reapplyEpisodeDecomposing(infoHash)))
//     .then(() => console.log('Finished'));

//findAllFiles().then(() => console.log('Finished'));
//updateMovieCollections().then(() => console.log('Finished'));
reapplyEpisodeDecomposing('96cc18f564f058384c18b4966a183d81808ce3fb', true).then(() => console.log('Finished'));
//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
//reapplyDecomposingToTorrentsOnRegex('.*Title.*').then(() => console.log('Finished'));
//reapplyManualHashes().then(() => console.log('Finished'));
// assignSubs().then(() => console.log('Finished'));
// openTorrentContents().then(() => console.log('Finished'));
4269 scraper/package-lock.json generated Normal file
File diff suppressed because it is too large
49 scraper/package.json Normal file
@@ -0,0 +1,49 @@
{
  "name": "stremio-torrentio",
  "version": "1.0.14",
  "exports": "./index.js",
  "type": "commonjs",
  "scripts": {
    "start": "node index.js"
  },
  "author": "TheBeastLT <pauliox@beyond.lt>",
  "license": "MIT",
  "dependencies": {
    "@putdotio/api-client": "^8.42.0",
    "all-debrid-api": "^1.1.0",
    "axios": "^1.6.1",
    "bottleneck": "^2.19.5",
    "cache-manager": "^3.4.4",
    "cache-manager-mongodb": "^0.3.0",
    "cors": "^2.8.5",
    "debrid-link-api": "^1.0.1",
    "express-rate-limit": "^6.7.0",
    "google-it": "^1.6.4",
    "google-sr": "^3.2.1",
    "he": "^1.2.0",
    "jaro-winkler": "^0.2.8",
    "magnet-uri": "^6.2.0",
    "name-to-imdb": "^3.0.4",
    "named-queue": "^2.2.1",
    "needle": "^3.3.1",
    "node-schedule": "^2.1.1",
    "nodejs-bing": "^0.1.0",
    "nyaapi": "^2.4.4",
    "offcloud-api": "^1.0.2",
    "parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#022408972c2a040f846331a912a6a8487746a654",
    "pg": "^8.11.3",
    "pg-hstore": "^2.3.4",
    "premiumize-api": "^1.0.3",
    "prom-client": "^12.0.0",
    "real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#d1f7eaa8593b947edbfbc8a92a176448b48ef445",
    "request-ip": "^3.3.0",
    "router": "^1.3.8",
    "sequelize": "^6.31.1",
    "stremio-addon-sdk": "^1.6.10",
    "sugar-date": "^2.0.6",
    "swagger-stats": "^0.99.7",
    "torrent-stream": "^1.2.1",
    "ua-parser-js": "^1.0.36",
    "user-agents": "^1.0.1444"
  }
}
14 scraper/scheduler/scheduler.js Normal file
@@ -0,0 +1,14 @@
const { scheduleScraping, scrapeAll } = require('./scraper');
const { scheduleUpdateSeeders, scheduleUpdateSeedersForNewTorrents } = require('./seeders');

function startScraper() {
  if (process.env.ENABLE_SCHEDULING) {
    scheduleScraping();
    scheduleUpdateSeeders();
    scheduleUpdateSeedersForNewTorrents();
  } else {
    scrapeAll();
  }
}

module.exports = { startScraper };
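
// Note (illustrative): ENABLE_SCHEDULING is read as a plain string, so any non-empty value
// enables the cron schedulers, while leaving it unset runs a single scrape pass, e.g.:
//   ENABLE_SCHEDULING=true node index.js   vs   node index.js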
28 scraper/scheduler/scraper.js Normal file
@@ -0,0 +1,28 @@
const schedule = require('node-schedule');
const scrapers = require('./scrapers');
const { sequence } = require('../lib/promises');

function scheduleScraping() {
  const allCrons = scrapers.reduce((crons, provider) => {
    crons[provider.cron] = (crons[provider.cron] || []).concat(provider);
    return crons;
  }, {});
  Object.entries(allCrons).forEach(([cron, providers]) => schedule.scheduleJob(cron, () => _scrapeProviders(providers)));
}

function scrapeAll() {
  return _scrapeProviders(scrapers);
}

async function _scrapeProviders(providers) {
  return sequence(providers.map(provider => () => _singleScrape(provider)));
}

async function _singleScrape(provider) {
  return provider.scraper.scrape().catch(error => {
    console.warn(`Failed ${provider.name} scraping due to: `, error);
    return Promise.resolve();
  });
}

module.exports = { scheduleScraping, scrapeAll };
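
// Grouping example (illustrative): all providers sharing the cron '0 0 */4 ? * *' end up in
// one bucket, so a single scheduled job runs them sequentially every 4 hours instead of
// registering one potentially overlapping job per provider.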
45 scraper/scheduler/scrapers.js Normal file
@@ -0,0 +1,45 @@
const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
const thepiratebayFakeRemoval = require('../scrapers/thepiratebay/thepiratebay_fakes_removal');
const ytsScraper = require('../scrapers/yts/yts_scraper');
const eztvScraper = require('../scrapers/eztv/eztv_scraper');
const leetxScraper = require('../scrapers/1337x/1337x_scraper');
const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
const erairawsScraper = require('../scrapers/erairaws/erairaws_scraper');
const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
const rutorScraper = require('../scrapers/rutor/rutor_scraper');
const Comando = require('../scrapers/comando/comando_scraper');
const ComoEuBaixo = require('../scrapers/comoeubaixo/comoeubaixo_scraper');
const Lapumia = require('../scrapers/lapumia/lapumia_scraper');
const OndeBaixa = require('../scrapers/ondebaixa/ondebaixa_scraper');
const AnimesTorrent = require('../scrapers/animestorrent/animestorrent_scraper');
const DarkMahou = require('../scrapers/darkmahou/darkmahou_scraper');

module.exports = [
  { scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: eztvScraper, name: eztvScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */1 ? * *' },
  { scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
  { scraper: thepiratebayFakeRemoval, name: thepiratebayFakeRemoval.NAME, cron: '0 0 */12 ? * *' },
  { scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
  // { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
  { scraper: Comando, name: Comando.NAME, cron: '0 0 */4 ? * *' },
  { scraper: ComoEuBaixo, name: ComoEuBaixo.NAME, cron: '0 0 */4 ? * *' },
  { scraper: Lapumia, name: Lapumia.NAME, cron: '0 0 */4 ? * *' },
  { scraper: OndeBaixa, name: OndeBaixa.NAME, cron: '0 0 */4 ? * *' },
  // { scraper: AnimesTorrent, name: AnimesTorrent.NAME, cron: '0 0 */4 ? * *' },
  // { scraper: DarkMahou, name: DarkMahou.NAME, cron: '0 0 */4 ? * *' },
  { scraper: erairawsScraper, name: erairawsScraper.NAME, cron: '0 0 */24 ? * *' },
  // { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
  // { scraper: require('../scrapers/1337x/1337x_search_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
  // { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
];
54 scraper/scheduler/seeders.js Normal file
@@ -0,0 +1,54 @@
const Bottleneck = require('bottleneck');
const repository = require('../lib/repository');
const { delay } = require('../lib/promises');
const { updateCurrentSeeders } = require('../lib/torrent');
const { updateTorrentSeeders } = require('../lib/torrentEntries');

const DELAY_MS = 0; // 0 seconds
const updateLimiter = new Bottleneck({ maxConcurrent: 5 });
const statistics = {};
const statisticsNew = {};

function scheduleUpdateSeeders() {
  console.log('Starting seeders update...');
  getTorrents()
      .then(torrents => updateCurrentSeeders(torrents))
      .then(updatedTorrents => Promise.all(
          updatedTorrents.map(updated => updateLimiter.schedule(() => updateTorrentSeeders(updated)))))
      .then(torrents => updateStatistics(torrents, statistics))
      .then(() => console.log('Finished seeders update:', statistics))
      .catch(error => console.warn('Failed seeders update:', error))
      .then(() => delay(DELAY_MS))
      .then(() => scheduleUpdateSeeders());
}

function scheduleUpdateSeedersForNewTorrents() {
  console.log('Starting seeders update for new torrents...');
  getNewTorrents()
      .then(torrents => updateCurrentSeeders(torrents))
      .then(updatedTorrents => Promise.all(
          updatedTorrents.map(updated => updateLimiter.schedule(() => updateTorrentSeeders(updated)))))
      .then(torrents => updateStatistics(torrents, statisticsNew))
      .then(() => console.log('Finished seeders update for new torrents:', statisticsNew))
      .catch(error => console.warn('Failed seeders update for new torrents:', error))
      .then(() => delay(30_000))
      .then(() => scheduleUpdateSeedersForNewTorrents());
}

async function getTorrents() {
  return repository.getUpdateSeedersTorrents()
      .catch(() => delay(5000).then(() => getTorrents()));
}

async function getNewTorrents() {
  return repository.getUpdateSeedersNewTorrents()
      .catch(() => delay(5000).then(() => getNewTorrents()));
}

function updateStatistics(updatedTorrents, statisticsObject) {
  const totalTorrents = updatedTorrents.map(nested => nested.length).reduce((a, b) => a + b, 0);
  const date = new Date().toISOString().replace(/T.*/, '');
  statisticsObject[date] = (statisticsObject[date] || 0) + totalTorrents;
}

module.exports = { scheduleUpdateSeeders, scheduleUpdateSeedersForNewTorrents };
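
// Statistics shape example (illustrative values): after a few update rounds the objects look like
// { '2023-11-14': 12500, '2023-11-15': 4200 } — a running per-day count of updated torrents.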
182 scraper/scrapers/1337x/1337x_api.js Normal file
@@ -0,0 +1,182 @@
const cheerio = require('cheerio');
const needle = require('needle');
const Sugar = require('sugar-date');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');

const defaultProxies = [
  'https://1337x.to'
];
const defaultTimeout = 10000;
const maxSearchPage = 50;

const Categories = {
  MOVIE: 'Movies',
  TV: 'TV',
  ANIME: 'Anime',
  DOCUMENTARIES: 'Documentaries',
  APPS: 'Apps',
  GAMES: 'Games',
  MUSIC: 'Music',
  PORN: 'XXX',
  OTHER: 'Other',
};
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const slug = torrentId.startsWith('/torrent/') ? torrentId.replace('/torrent/', '') : torrentId;

  return Promises.first(proxyList
          .map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${slug}`, config)))
      .then((body) => parseTorrentPage(body))
      .then((torrent) => ({ torrentId: slug, ...torrent }))
      .catch((err) => torrent(slug, config, retries - 1));
}
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1));
  const requestUrl = proxyUrl => category
      ? `${proxyUrl}/category-search/${keyword}/${category}/${page}/`
      : `${proxyUrl}/search/${keyword}/${page}/`;

  return Promises.first(proxyList
          .map(proxyUrl => singleRequest(requestUrl(proxyUrl), config)))
      .then(body => parseTableBody(body))
      .then(torrents => torrents.length === 40 && page < extendToPage
          ? search(keyword, { ...config, page: page + 1 }).catch(() => [])
              .then(nextTorrents => torrents.concat(nextTorrents))
          : torrents)
      .catch((err) => search(keyword, config, retries - 1));
}
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  const sort = config.sort;
  const requestUrl = proxyUrl => sort
      ? `${proxyUrl}/sort-cat/${category}/${sort}/desc/${page}/`
      : `${proxyUrl}/cat/${category}/${page}/`;

  return Promises.first(proxyList
          .map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)))
      .then((body) => parseTableBody(body))
      .catch((err) => browse(config, retries - 1));
}
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };

  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl}`);
        } else if (body.includes('502: Bad gateway') ||
            body.includes('403 Forbidden') ||
            !(body.includes('1337x</title>'))) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }

    const torrents = [];

    $('.table > tbody > tr').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find('a').eq(1).text(),
        torrentId: row.find('a').eq(1).attr('href').replace('/torrent/', ''),
        seeders: parseInt(row.children('td.coll-2').text()),
        leechers: parseInt(row.children('td.coll-3').text()),
        size: parseSize(row.children('td.coll-4').text())
      });
    });

    resolve(torrents);
  });
}
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }

    const details = $('.torrent-detail-page');
    const magnetLink = details.find('a:contains(\'Magnet Download\')').attr('href');
    // guard against a missing description block, whose .html() would return null
    const imdbIdMatch = (details.find('div[id=\'description\']').html() || '').match(/imdb\.com\/title\/(tt\d+)/i);

    const torrent = {
      name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      seeders: parseInt(details.find('strong:contains(\'Seeders\')').next().text(), 10),
      leechers: parseInt(details.find('strong:contains(\'Leechers\')').next().text(), 10),
      category: details.find('strong:contains(\'Category\')').next().text(),
      languages: details.find('strong:contains(\'Language\')').next().text(),
      size: parseSize(details.find('strong:contains(\'Total size\')').next().text()),
      uploadDate: parseDate(details.find('strong:contains(\'Date uploaded\')').next().text()),
      imdbId: imdbIdMatch && imdbIdMatch[1],
      files: details.find('div[id=\'files\']').first().find('li')
          .map((i, elem) => $(elem).text())
          .map((i, text) => ({
            fileIndex: i,
            name: text.match(/^(.+)\s\(.+\)$/)[1].replace(/^.+\//g, ''),
            path: text.match(/^(.+)\s\(.+\)$/)[1],
            size: parseSize(text.match(/^.+\s\((.+)\)$/)[1])
          })).get()
    };
    resolve(torrent);
  });
}
function parseDate(dateString) {
|
||||
if (/decade.*ago/i.test(dateString)) {
|
||||
return Sugar.Date.create('10 years ago');
|
||||
}
|
||||
return Sugar.Date.create(dateString);
|
||||
}
|
||||
|
||||
function parseSize(sizeText) {
|
||||
if (!sizeText) {
|
||||
return undefined;
|
||||
}
|
||||
let scale = 1;
|
||||
if (sizeText.includes('GB')) {
|
||||
scale = 1024 * 1024 * 1024
|
||||
} else if (sizeText.includes('MB')) {
|
||||
scale = 1024 * 1024;
|
||||
} else if (sizeText.includes('KB')) {
|
||||
scale = 1024;
|
||||
}
|
||||
return Math.floor(parseFloat(sizeText.replace(/,/g, '')) * scale);
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
111
scraper/scrapers/1337x/1337x_scraper.js
Normal file
111
scraper/scrapers/1337x/1337x_scraper.js
Normal file
@@ -0,0 +1,111 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const leetx = require('./1337x_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../../lib/repository');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = '1337x';
|
||||
const UNTIL_PAGE = 10;
|
||||
const TYPE_MAPPING = typeMapping();
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => leetx.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
leetx.Categories.MOVIE,
|
||||
leetx.Categories.TV,
|
||||
leetx.Categories.ANIME,
|
||||
leetx.Categories.DOCUMENTARIES
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then(entries => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return leetx.browse(({ category, page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
|
||||
.then(resolved => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...record })) {
|
||||
return record;
|
||||
}
|
||||
|
||||
const torrentFound = await leetx.torrent(record.torrentId).catch(() => undefined);
|
||||
|
||||
if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) {
|
||||
return Promise.resolve('Invalid torrent record');
|
||||
}
|
||||
if (isNaN(torrentFound.uploadDate)) {
|
||||
console.warn(`Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}`);
|
||||
return;
|
||||
}
|
||||
if (await checkAndUpdateTorrent(torrentFound)) {
|
||||
return torrentFound;
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: torrentFound.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: torrentFound.torrentId,
|
||||
title: torrentFound.name.replace(/\t|\s+/g, ' ').trim(),
|
||||
type: TYPE_MAPPING[torrentFound.category],
|
||||
size: torrentFound.size,
|
||||
seeders: torrentFound.seeders,
|
||||
uploadDate: torrentFound.uploadDate,
|
||||
imdbId: torrentFound.imdbId,
|
||||
languages: torrentFound.languages || undefined
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function typeMapping() {
|
||||
const mapping = {};
|
||||
mapping[leetx.Categories.MOVIE] = Type.MOVIE;
|
||||
mapping[leetx.Categories.DOCUMENTARIES] = Type.SERIES;
|
||||
mapping[leetx.Categories.TV] = Type.SERIES;
|
||||
mapping[leetx.Categories.ANIME] = Type.ANIME;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (leetx.Categories.ANIME === category) {
|
||||
return 5;
|
||||
}
|
||||
if (leetx.Categories.DOCUMENTARIES === category) {
|
||||
return 1;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
133
scraper/scrapers/animestorrent/animestorrent_api.js
Normal file
133
scraper/scrapers/animestorrent/animestorrent_api.js
Normal file
@@ -0,0 +1,133 @@
|
||||
const needle = require("needle");
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require("magnet-uri");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { getRandomUserAgent } = require("../../lib/requestHelper");
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
const maxSearchPage = 50;
|
||||
|
||||
const baseUrl = 'https://animestorrent.com';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'filme',
|
||||
ANIME: 'tv',
|
||||
OVA: 'ova'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = torrentId.split("/")[3];
|
||||
return singleRequest(`${baseUrl}/${slug}`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el })))
|
||||
.catch((err) => torrent(slug, config, retries - 1));
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
|
||||
|
||||
return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.then((torrents) =>
|
||||
torrents.length === 40 && page < extendToPage
|
||||
? search(keyword, { ...config, page: page + 1 })
|
||||
.catch(() => [])
|
||||
.then((nextTorrents) => torrents.concat(nextTorrents))
|
||||
: torrents)
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
const requestUrl = category ? `${baseUrl}/tipo/${category}/page/${page}/` : `${baseUrl}/page/${page}/`;
|
||||
|
||||
return singleRequest(requestUrl, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = {
|
||||
userAgent: getRandomUserAgent(),
|
||||
open_timeout: timeout,
|
||||
follow: 2,
|
||||
};
|
||||
|
||||
return needle("get", requestUrl, options).then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (
|
||||
body.includes("502: Bad gateway") ||
|
||||
body.includes("403 Forbidden")
|
||||
) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$("article.bs").each((i, element) => {
|
||||
const row = $(element);
|
||||
torrents.push({
|
||||
name: row.find("span.ntitle").text(),
|
||||
torrentId: row.find("div > a").attr("href"),
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise(async (resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
let magnets = [];
|
||||
$(`a[href^="magnet"]`).each((i, section) => {
|
||||
const magnet = $(section).attr("href");
|
||||
magnets.push(magnet);
|
||||
});
|
||||
const details = $('div.infox')
|
||||
const torrents = magnets.map((magnetLink) => {
|
||||
return {
|
||||
title: decode(magnetLink).name,
|
||||
originalName: details.find('h1.entry-title').text(),
|
||||
year: details.find('b:contains(\'Lançamento:\')')[0]
|
||||
? details.find('b:contains(\'Lançamento:\')')[0].nextSibling.nodeValue.trim()
|
||||
: '',
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
magnetLink: magnetLink,
|
||||
category: details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4],
|
||||
uploadDate: new Date($("time[itemprop=dateModified]").attr("datetime")),
|
||||
};
|
||||
})
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
108
scraper/scrapers/animestorrent/animestorrent_scraper.js
Normal file
108
scraper/scrapers/animestorrent/animestorrent_scraper.js
Normal file
@@ -0,0 +1,108 @@
|
||||
const moment = require("moment");
|
||||
const Bottleneck = require("bottleneck");
|
||||
const animetorrrent = require("./animestorrent_api");
|
||||
const { Type } = require("../../lib/types");
|
||||
const repository = require("../../lib/repository");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
|
||||
const { getKitsuId } = require("../../lib/metadata");
|
||||
|
||||
const NAME = "AnimesTorrent";
|
||||
const UNTIL_PAGE = 5;
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => animetorrrent.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
animetorrrent.Categories.MOVIE,
|
||||
animetorrrent.Categories.ANIME,
|
||||
animetorrrent.Categories.OVA
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories
|
||||
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return animetorrrent
|
||||
.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processEntry(entry) {
|
||||
return animetorrrent.torrent(entry.torrentId)
|
||||
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
|
||||
.catch(() => undefined);
|
||||
}
|
||||
|
||||
async function processTorrentRecord(foundTorrent) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
|
||||
return foundTorrent;
|
||||
}
|
||||
|
||||
if (!foundTorrent.size) {
|
||||
await updateTorrentSize(foundTorrent);
|
||||
}
|
||||
if (!Number.isInteger(foundTorrent.seeders)) {
|
||||
await updateCurrentSeeders(foundTorrent);
|
||||
}
|
||||
if (!foundTorrent.imdbId && !foundTorrent.kitsuId) {
|
||||
const info = { title: foundTorrent.originalName, year: foundTorrent.year };
|
||||
foundTorrent.kitsuId = await getKitsuId(info).catch(() => undefined);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: foundTorrent.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: foundTorrent.torrentId,
|
||||
title: foundTorrent.title,
|
||||
type: Type.ANIME,
|
||||
imdbId: foundTorrent.imdbId,
|
||||
kitsuId: foundTorrent.kitsuId,
|
||||
uploadDate: foundTorrent.uploadDate,
|
||||
seeders: foundTorrent.seeders,
|
||||
size: foundTorrent.size,
|
||||
files: foundTorrent.files,
|
||||
languages: foundTorrent.languages
|
||||
};
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (animetorrrent.Categories.ANIME === category) {
|
||||
return 5;
|
||||
}
|
||||
if (animetorrrent.Categories.OVA === category) {
|
||||
return 3;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
161
scraper/scrapers/comando/comando_api.js
Normal file
161
scraper/scrapers/comando/comando_api.js
Normal file
@@ -0,0 +1,161 @@
|
||||
const needle = require("needle")
|
||||
const moment = require("moment")
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require('magnet-uri');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { escapeHTML } = require('../../lib/metadata');
|
||||
const { getRandomUserAgent } = require('../../lib/requestHelper');
|
||||
const { isPtDubbed, sanitizePtName, sanitizePtLanguages, sanitizePtOriginalName } = require('../scraperHelper')
|
||||
|
||||
const defaultTimeout = 30000;
|
||||
const maxSearchPage = 50
|
||||
|
||||
const baseUrl = 'https://comando.to';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'filmes',
|
||||
TV: 'series',
|
||||
ANIME: 'animes',
|
||||
DOCUMENTARIES: 'documentario'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = torrentId.split("/")[3];
|
||||
return singleRequest(`${baseUrl}/${slug}`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => torrent.map(el => ({ torrentId: slug, ...el })))
|
||||
.catch((err) => {
|
||||
console.warn(`Failed Comando ${slug} request: `, err);
|
||||
return torrent(torrentId, config, retries - 1)
|
||||
});
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1))
|
||||
|
||||
return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
|
||||
.then(body => parseTableBody(body))
|
||||
.then(torrents => torrents.length === 40 && page < extendToPage
|
||||
? search(keyword, { ...config, page: page + 1 }).catch(() => [])
|
||||
.then(nextTorrents => torrents.concat(nextTorrents))
|
||||
: torrents)
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return singleRequest(`${baseUrl}/category/${category}/page/${page}/`, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (body.includes('502: Bad gateway') ||
|
||||
body.includes('403 Forbidden')) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$('article').each((i, element) => {
|
||||
const row = $(element);
|
||||
torrents.push({
|
||||
name: row.find("h2 > a").text(),
|
||||
torrentId: row.find("h2 > a").attr("href")
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
const magnets = $('h2 > strong')
|
||||
.filter((i, elem) => isPtDubbed($(elem).text())).parent()
|
||||
.map((i, elem) => $(elem).nextUntil('h2, hr'))
|
||||
.map((i, elem) => $(elem).find('a[href^="magnet"]'))
|
||||
.map((i, section) => $(section).attr("href")).get();
|
||||
const details = $('strong, b').filter((i, elem) => $(elem).text().match(/Servidor|Orig(?:\.|inal)/)).parent();
|
||||
const imdbIdMatch = details.find('a[href*="imdb.com"]').attr('href')
|
||||
const torrents = magnets.map(magnetLink => {
|
||||
const decodedMagnet = decode(magnetLink);
|
||||
const originalNameElem = details.find('strong, b')
|
||||
.filter((i, elem) => $(elem).text().match(/Baixar|Orig(?:\.|inal)|^Título:/));
|
||||
const languagesElem = details.find('strong, b')
|
||||
.filter((i, elem) => $(elem).text().match(/^\s*([IÍ]dioma|[AÁ]udio)/));
|
||||
const originalName = parseOriginalName(originalNameElem);
|
||||
const title = decodedMagnet.name && escapeHTML(decodedMagnet.name.replace(/\+/g, ' '));
|
||||
return {
|
||||
title: title ? sanitizePtName(title) : originalName,
|
||||
originalName: sanitizePtOriginalName(originalName),
|
||||
year: details.find('a[href*="comando.to/category/"]').text(),
|
||||
infoHash: decodedMagnet.infoHash,
|
||||
magnetLink: magnetLink,
|
||||
category: parseCategory($('div.entry-categories').html()),
|
||||
uploadDate: new Date(moment($('a.updated').text(), 'LL', 'pt-br').format()),
|
||||
imdbId: imdbIdMatch ? imdbIdMatch.split('/')[4] : null,
|
||||
languages: sanitizePtLanguages(languagesElem[0].nextSibling.nodeValue)
|
||||
}
|
||||
});
|
||||
resolve(torrents.filter((x) => x));
|
||||
});
|
||||
}
|
||||
|
||||
function parseOriginalName(originalNameElem) {
|
||||
if (!originalNameElem[0]) {
|
||||
return '';
|
||||
}
|
||||
const originalName = originalNameElem.next().text().trim() || originalNameElem[0].nextSibling.nodeValue;
|
||||
return originalName.replace(/: ?/, '');
|
||||
}
|
||||
|
||||
function parseCategory(categorys) {
|
||||
const $ = cheerio.load(categorys)
|
||||
if ($('a:contains(\'animes\')').text()) {
|
||||
return Categories.ANIME
|
||||
}
|
||||
if ($('a:contains(\'Filmes\')').text()) {
|
||||
return Categories.MOVIE
|
||||
}
|
||||
if ($('a:contains(\'Series\')').text()) {
|
||||
return Categories.TV
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
112
scraper/scrapers/comando/comando_scraper.js
Normal file
112
scraper/scrapers/comando/comando_scraper.js
Normal file
@@ -0,0 +1,112 @@
|
||||
const moment = require("moment");
|
||||
const Bottleneck = require("bottleneck");
|
||||
const comando = require("./comando_api");
|
||||
const { Type } = require("../../lib/types");
|
||||
const repository = require("../../lib/repository");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
|
||||
const { getImdbId } = require("../../lib/metadata");
|
||||
|
||||
const NAME = "Comando";
|
||||
const UNTIL_PAGE = 5;
|
||||
const TYPE_MAPPING = typeMapping();
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => comando.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
comando.Categories.MOVIE,
|
||||
comando.Categories.TV
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories
|
||||
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return comando.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processEntry(entry) {
|
||||
return comando.torrent(entry.torrentId)
|
||||
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
|
||||
.catch(() => undefined);
|
||||
}
|
||||
|
||||
async function processTorrentRecord(foundTorrent) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
|
||||
return foundTorrent;
|
||||
}
|
||||
|
||||
if (!foundTorrent.size) {
|
||||
await updateTorrentSize(foundTorrent);
|
||||
}
|
||||
if (!Number.isInteger(foundTorrent.seeders)) {
|
||||
await updateCurrentSeeders(foundTorrent);
|
||||
}
|
||||
if (!foundTorrent.imdbId && foundTorrent.originalName) {
|
||||
const info = { title: foundTorrent.originalName, year: foundTorrent.year };
|
||||
foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: foundTorrent.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: foundTorrent.torrentId,
|
||||
title: foundTorrent.title,
|
||||
type: TYPE_MAPPING[foundTorrent.category],
|
||||
imdbId: foundTorrent.imdbId,
|
||||
uploadDate: foundTorrent.uploadDate,
|
||||
seeders: foundTorrent.seeders,
|
||||
size: foundTorrent.size,
|
||||
files: foundTorrent.files,
|
||||
languages: foundTorrent.languages
|
||||
};
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function typeMapping() {
|
||||
const mapping = {};
|
||||
mapping[comando.Categories.MOVIE] = Type.MOVIE;
|
||||
mapping[comando.Categories.DOCUMENTARIES] = Type.SERIES;
|
||||
mapping[comando.Categories.TV] = Type.SERIES;
|
||||
mapping[comando.Categories.ANIME] = Type.ANIME;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (comando.Categories.TV === category) {
|
||||
return 5;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
135
scraper/scrapers/comoeubaixo/comoeubaixo_api.js
Normal file
135
scraper/scrapers/comoeubaixo/comoeubaixo_api.js
Normal file
@@ -0,0 +1,135 @@
|
||||
const needle = require("needle")
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require('magnet-uri');
|
||||
const { escapeHTML } = require('../../lib/metadata');
|
||||
const { getRandomUserAgent } = require('../../lib/requestHelper');
|
||||
const { isPtDubbed, sanitizePtName, sanitizePtLanguages } = require('../scraperHelper')
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
const maxSearchPage = 50
|
||||
|
||||
const baseUrl = 'https://comoeubaixo.com';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'filmes',
|
||||
TV: 'series',
|
||||
ANIME: 'anime',
|
||||
DESENHOS: 'desenhos'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = encodeURIComponent(torrentId.split("/")[3]);
|
||||
return singleRequest(`${baseUrl}/${slug}/`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => torrent.map(el => ({ torrentId: slug, ...el })))
|
||||
.catch((err) => {
|
||||
console.warn(`Failed ComoEuBaixo ${torrentId} request: `, err);
|
||||
return torrent(torrentId, config, retries - 1)
|
||||
});
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1))
|
||||
|
||||
return singleRequest(`${baseUrl}/${keyword}/${page}/`, config)
|
||||
.then(body => parseTableBody(body))
|
||||
.then(torrents => torrents.length === 40 && page < extendToPage
|
||||
? search(keyword, { ...config, page: page + 1 }).catch(() => [])
|
||||
.then(nextTorrents => torrents.concat(nextTorrents))
|
||||
: torrents)
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
const requestUrl = category ? `${baseUrl}/${category}/${page}/` : `${baseUrl}/${page}/`;
|
||||
|
||||
return singleRequest(requestUrl, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (!body || (Buffer.isBuffer(body) && !body.size)) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (body.includes('502: Bad gateway') ||
|
||||
body.includes('403 Forbidden')) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$('div.capa_larga.align-middle').each((i, element) => {
|
||||
const row = $(element);
|
||||
torrents.push({
|
||||
name: row.find("a").text(),
|
||||
torrentId: row.find("a").attr("href"),
|
||||
isTorrent: !!row.find("p:contains(\'Torrent\')")[0]
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
const magnets = $(`a[href^="magnet"]`)
|
||||
.filter((i, elem) => isPtDubbed($(elem).attr('title')))
|
||||
.map((i, elem) => $(elem).attr("href")).get();
|
||||
const details = $('div#informacoes')
|
||||
const category = details.find('strong:contains(\'Gêneros: \')').next().attr('href').split('/')[0]
|
||||
const torrents = magnets.map(magnetLink => {
|
||||
const decodedMagnet = decode(magnetLink);
|
||||
const name = escapeHTML(decodedMagnet.name || '').replace(/\+/g, ' ');
|
||||
const sanitizedTitle = sanitizePtName(name);
|
||||
const originalTitle = details.find('strong:contains(\'Baixar\')')[0].nextSibling.nodeValue.split('-')[0];
|
||||
const year = details.find('strong:contains(\'Data de Lançamento: \')').next().text().trim();
|
||||
const fallBackTitle = `${originalTitle.trim()} ${year.trim()} ${sanitizedTitle.trim()}`;
|
||||
return {
|
||||
title: sanitizedTitle.length > 5 ? sanitizedTitle : fallBackTitle,
|
||||
infoHash: decodedMagnet.infoHash,
|
||||
magnetLink: magnetLink,
|
||||
category: category,
|
||||
uploadDate: new Date($('time').attr('datetime')),
|
||||
imdbId: details.find('a[href*="imdb.com"]').attr('href').split('/')[4],
|
||||
languages: sanitizePtLanguages(details.find('strong:contains(\'Idioma\')')[0].nextSibling.nodeValue)
|
||||
};
|
||||
})
|
||||
resolve(torrents.filter((x) => x));
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
115
scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js
Normal file
115
scraper/scrapers/comoeubaixo/comoeubaixo_scraper.js
Normal file
@@ -0,0 +1,115 @@
|
||||
const moment = require("moment");
|
||||
const Bottleneck = require("bottleneck");
|
||||
const comoeubaixo = require("./comoeubaixo_api");
|
||||
const { Type } = require("../../lib/types");
|
||||
const repository = require("../../lib/repository");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
|
||||
|
||||
const NAME = "ComoEuBaixo";
|
||||
const UNTIL_PAGE = 5;
|
||||
const TYPE_MAPPING = typeMapping();
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => comoeubaixo.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
comoeubaixo.Categories.MOVIE,
|
||||
comoeubaixo.Categories.TV,
|
||||
comoeubaixo.Categories.DESENHOS
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories
|
||||
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return comoeubaixo
|
||||
.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processEntry(entry) {
|
||||
if (!entry.isTorrent) {
|
||||
return entry;
|
||||
}
|
||||
return comoeubaixo.torrent(entry.torrentId)
|
||||
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
|
||||
.catch(() => undefined);
|
||||
}
|
||||
|
||||
async function processTorrentRecord(foundTorrent) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
|
||||
return foundTorrent;
|
||||
}
|
||||
|
||||
if (!foundTorrent.size) {
|
||||
await updateTorrentSize(foundTorrent);
|
||||
}
|
||||
if (!Number.isInteger(foundTorrent.seeders)) {
|
||||
await updateCurrentSeeders(foundTorrent);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: foundTorrent.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: foundTorrent.torrentId,
|
||||
title: foundTorrent.title,
|
||||
type: TYPE_MAPPING[foundTorrent.category],
|
||||
imdbId: foundTorrent.imdbId,
|
||||
uploadDate: foundTorrent.uploadDate,
|
||||
seeders: foundTorrent.seeders,
|
||||
size: foundTorrent.size,
|
||||
files: foundTorrent.files,
|
||||
languages: foundTorrent.languages
|
||||
};
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function typeMapping() {
|
||||
const mapping = {};
|
||||
mapping[comoeubaixo.Categories.MOVIE] = Type.MOVIE;
|
||||
mapping[comoeubaixo.Categories.TV] = Type.SERIES;
|
||||
mapping[comoeubaixo.Categories.ANIME] = Type.ANIME;
|
||||
mapping[comoeubaixo.Categories.DESENHOS] = Type.SERIES;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (comoeubaixo.Categories.DESENHOS === category) {
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
if (comoeubaixo.Categories.TV === category) {
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
132
scraper/scrapers/darkmahou/darkmahou_api.js
Normal file
132
scraper/scrapers/darkmahou/darkmahou_api.js
Normal file
@@ -0,0 +1,132 @@
|
||||
const needle = require("needle");
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require("magnet-uri");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { getRandomUserAgent } = require("../../lib/requestHelper");
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
const maxSearchPage = 50;
|
||||
|
||||
const baseUrl = 'https://darkmahou.com';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'movie',
|
||||
ANIME: 'tv',
|
||||
OVA: 'ova'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = torrentId.split("/")[3];
|
||||
return singleRequest(`${baseUrl}/${slug}`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el })))
|
||||
.catch((err) => torrent(slug, config, retries - 1));
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
|
||||
|
||||
return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.then((torrents) =>
|
||||
torrents.length === 40 && page < extendToPage
|
||||
? search(keyword, { ...config, page: page + 1 })
|
||||
.catch(() => [])
|
||||
.then((nextTorrents) => torrents.concat(nextTorrents))
|
||||
: torrents
|
||||
)
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
const requestUrl = category ? `${baseUrl}/category/${category}/page/${page}/` : `${baseUrl}/page/${page}/`;
|
||||
|
||||
return singleRequest(requestUrl, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = {
|
||||
userAgent: getRandomUserAgent(),
|
||||
open_timeout: timeout,
|
||||
follow: 2,
|
||||
};
|
||||
|
||||
return needle("get", requestUrl, options).then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (
|
||||
body.includes("502: Bad gateway") ||
|
||||
body.includes("403 Forbidden")
|
||||
) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$("article.bs").each((i, element) => {
|
||||
const row = $(element);
|
||||
torrents.push({
|
||||
name: row.find("span.ntitle").text(),
|
||||
torrentId: row.find("div > a").attr("href"),
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise(async (resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
let magnets = [];
|
||||
$(`a[href^="magnet"]`).each((i, section) => {
|
||||
const magnet = $(section).attr("href");
|
||||
magnets.push(magnet);
|
||||
});
|
||||
const details = $('div.infox')
|
||||
const torrent = magnets.map((magnetLink) => {
|
||||
return {
|
||||
title: decode(magnetLink).name,
|
||||
originalName: details.find('h1.entry-title').text(),
|
||||
year: details.find('b:contains(\'Lançado:\')')[0].nextSibling.nodeValue || '',
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
magnetLink: magnetLink,
|
||||
category: details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4],
|
||||
uploadDate: new Date($("time[itemprop=dateModified]").attr("datetime")),
|
||||
};
|
||||
})
|
||||
resolve(torrent.filter((x) => x));
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
108
scraper/scrapers/darkmahou/darkmahou_scraper.js
Normal file
108
scraper/scrapers/darkmahou/darkmahou_scraper.js
Normal file
@@ -0,0 +1,108 @@
|
||||
const moment = require("moment");
|
||||
const Bottleneck = require("bottleneck");
|
||||
const darkmahou = require("./darkmahou_api");
|
||||
const { Type } = require("../../lib/types");
|
||||
const repository = require("../../lib/repository");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
|
||||
const { getKitsuId } = require("../../lib/metadata");
|
||||
|
||||
const NAME = "DarkMahou";
|
||||
const UNTIL_PAGE = 5;
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => darkmahou.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
darkmahou.Categories.MOVIE,
|
||||
darkmahou.Categories.ANIME,
|
||||
darkmahou.Categories.OVA
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories
|
||||
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return darkmahou
|
||||
.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processEntry(entry) {
|
||||
return darkmahou.torrent(entry.torrentId)
|
||||
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
|
||||
.catch(() => undefined);
|
||||
}
|
||||
|
||||
async function processTorrentRecord(foundTorrent) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
|
||||
return foundTorrent;
|
||||
}
|
||||
|
||||
if (!foundTorrent.size) {
|
||||
await updateTorrentSize(foundTorrent);
|
||||
}
|
||||
if (!Number.isInteger(foundTorrent.seeders)) {
|
||||
await updateCurrentSeeders(foundTorrent);
|
||||
}
|
||||
if (!foundTorrent.imdbId && !foundTorrent.kitsuId) {
|
||||
const info = { title: foundTorrent.originalName, year: foundTorrent.year };
|
||||
foundTorrent.kitsuId = await getKitsuId(info).catch(() => undefined);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: foundTorrent.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: foundTorrent.torrentId,
|
||||
title: foundTorrent.title,
|
||||
type: Type.ANIME,
|
||||
imdbId: foundTorrent.imdbId,
|
||||
kitsuId: foundTorrent.kitsuId,
|
||||
uploadDate: foundTorrent.uploadDate,
|
||||
seeders: foundTorrent.seeders,
|
||||
size: foundTorrent.size,
|
||||
files: foundTorrent.files,
|
||||
languages: foundTorrent.languages
|
||||
};
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (darkmahou.Categories.ANIME === category) {
|
||||
return 5;
|
||||
}
|
||||
if (darkmahou.Categories.OVA === category) {
|
||||
return 4;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
94
scraper/scrapers/erairaws/erairaws_api.js
Normal file
94
scraper/scrapers/erairaws/erairaws_api.js
Normal file
@@ -0,0 +1,94 @@
|
||||
const needle = require("needle");
|
||||
const axios = require('axios');
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require("magnet-uri");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { getRandomUserAgent } = require("../../lib/requestHelper");
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
const baseUrl = 'https://www.erai-raws.info';
|
||||
|
||||
const Categories = {
|
||||
ANIMES: 'anime',
|
||||
EPISODES: 'episodes'
|
||||
};
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return singleRequest(`${baseUrl}/${category}/page/${page}/`, config)
|
||||
.then((body) => parseTableBody(body)
|
||||
.then(animes => Promises.sequence(animes.map(anime => () => singleRequest(anime.animeLink))))
|
||||
.then(animeBodies => Promise.all(animeBodies.map(animeBody => parseTorrentPage(animeBody))))
|
||||
.then(animeInfos => animeInfos.reduce((a, b) => a.concat(b), [])))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = { userAgent: getRandomUserAgent(), timeout: timeout, follow: 2, };
|
||||
|
||||
return axios.get(requestUrl, options).then((response) => {
|
||||
const body = response.data;
|
||||
if (!body || (Buffer.isBuffer(body) && !body.size)) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (
|
||||
body.includes("502: Bad gateway") ||
|
||||
body.includes("403 Forbidden")
|
||||
) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
|
||||
const links = $('[itemprop=\'headline\'] a, .content-area a.aa_ss_ops_new')
|
||||
.map((i, element) => ({
|
||||
name: $(element).text(),
|
||||
animeLink: $(element).attr("href"),
|
||||
})).get();
|
||||
resolve(links);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise(async (resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error("Failed loading body"));
|
||||
}
|
||||
const entries = $('.tab-content table, .content-area table')
|
||||
.map((i, entry) => {
|
||||
const languages = $(entry).find('.tooltip3').map((_, l) => $(l).attr('data-title')).get().join('/');
|
||||
const magnets = $(entry).find('a[href^="magnet"]').map((_, m) => $(m).attr('href')).get();
|
||||
return { languages, magnets }
|
||||
}).get();
|
||||
const torrents = entries
|
||||
.map(entry => entry.magnets
|
||||
.map(magnet => decode(magnet))
|
||||
.map(decodedMagnet => ({
|
||||
title: decodedMagnet.name,
|
||||
infoHash: decodedMagnet.infoHash,
|
||||
trackers: decodedMagnet.tr,
|
||||
languages: entry.languages
|
||||
})))
|
||||
.reduce((a, b) => a.concat(b), []);
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = { browse, Categories };
|
||||
47
scraper/scrapers/erairaws/erairaws_scraper.js
Normal file
47
scraper/scrapers/erairaws/erairaws_scraper.js
Normal file
@@ -0,0 +1,47 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const erairaws = require('./erairaws_api');
|
||||
const { checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'EraiRaws';
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
return scrapeLatestTorrentsForCategory(erairaws.Categories.EPISODES)
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return erairaws.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processRecord(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve([]));
|
||||
}
|
||||
|
||||
async function processRecord(foundTorrent) {
|
||||
return checkAndUpdateTorrent({ provider: NAME, ...foundTorrent }).then(() => foundTorrent);
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (category === erairaws.Categories.ANIMES) {
|
||||
return 45;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
module.exports = { scrape, NAME };
|
||||
135
scraper/scrapers/eztv/eztv_api.js
Normal file
135
scraper/scrapers/eztv/eztv_api.js
Normal file
@@ -0,0 +1,135 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { getRandomUserAgent } = require('./../../lib/requestHelper');
|
||||
|
||||
const defaultProxies = [
|
||||
'https://eztv.re'
|
||||
];
|
||||
const defaultTimeout = 120000;
|
||||
const minDelay = 3000;
|
||||
const jitterDelay = minDelay;
|
||||
const limit = 100;
|
||||
const maxPage = 5;
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 1) {
|
||||
if (!torrentId) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} search`));
|
||||
}
|
||||
|
||||
return Promises.first(defaultProxies
|
||||
.map(proxyUrl => singleRequest(`${proxyUrl}/ep/${torrentId}`, config)))
|
||||
.then(body => parseTorrentPage(body))
|
||||
.then(torrent => ({ torrentId, ...torrent }))
|
||||
.catch(error => retries ? jitter().then(() => torrent(torrentId, config, retries - 1)) : Promise.reject(error));
|
||||
}
|
||||
|
||||
function search(imdbId, config = {}, retries = 1) {
|
||||
if (!imdbId) {
|
||||
return Promise.reject(new Error(`Failed ${imdbId} search`));
|
||||
}
|
||||
const id = imdbId.replace('tt', '');
|
||||
const page = config.page || 1;
|
||||
|
||||
return Promises.first(defaultProxies
|
||||
.map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config)))
|
||||
.then(results => parseResults(results))
|
||||
.then(torrents => torrents.length === limit && page < maxPage
|
||||
? search(imdbId, { ...config, page: page + 1 }).catch(() => [])
|
||||
.then(nextTorrents => torrents.concat(nextTorrents))
|
||||
: torrents)
|
||||
.catch(error => retries ? jitter().then(() => search(imdbId, config, retries - 1)) : Promise.reject(error));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 1) {
|
||||
const page = config.page || 1;
|
||||
|
||||
return Promises.first(defaultProxies
|
||||
.map(proxyUrl => singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}`, config)))
|
||||
.then(results => parseResults(results))
|
||||
.catch(error => retries ? jitter().then(() => browse(config, retries - 1)) : Promise.reject(error));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = {
|
||||
userAgent: getRandomUserAgent(),
|
||||
open_timeout: timeout,
|
||||
response_timeout: timeout,
|
||||
read_timeout: timeout,
|
||||
follow: 2
|
||||
};
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then(response => {
|
||||
if (!response.body) {
|
||||
return Promise.reject(`No body: ${requestUrl}`);
|
||||
}
|
||||
return Promise.resolve(response.body);
|
||||
});
|
||||
}
|
||||
|
||||
function parseResults(results) {
|
||||
if (!results || !Array.isArray(results.torrents)) {
|
||||
return Promise.reject(`Incorrect results ${results}`)
|
||||
}
|
||||
return results.torrents.map(torrent => parseTorrent(torrent));
|
||||
}
|
||||
|
||||
function parseTorrent(torrent) {
|
||||
return {
|
||||
name: torrent.title.replace(/EZTV$/, ''),
|
||||
torrentId: torrent.episode_url.replace(/.*\/ep\//, ''),
|
||||
infoHash: torrent.hash.trim().toLowerCase(),
|
||||
magnetLink: torrent.magnet_url,
|
||||
torrentLink: torrent.torrent_url,
|
||||
seeders: torrent.seeds,
|
||||
size: torrent.size_bytes,
|
||||
uploadDate: new Date(torrent.date_released_unix * 1000),
|
||||
imdbId: torrent.imdb_id !== '0' && 'tt' + torrent.imdb_id || undefined
|
||||
}
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
const content = $('table[class="forum_header_border_normal"]');
|
||||
const torrent = {
|
||||
name: content.find('h1 > span').text().replace(/EZTV$/, ''),
|
||||
infoHash: content.find('b:contains(\'Torrent Hash:\')')[0].nextSibling.data.trim().toLowerCase(),
|
||||
magnetLink: content.find('a[title="Magnet Link"]').attr('href'),
|
||||
torrentLink: content.find('a[title="Download Torrent"]').attr('href'),
|
||||
seeders: parseInt(content.find('span[class="stat_red"]').first().text(), 10) || 0,
|
||||
size: parseSize(content.find('b:contains(\'Filesize:\')')[0].nextSibling.data),
|
||||
uploadDate: moment(content.find('b:contains(\'Released:\')')[0].nextSibling.data, 'Do MMM YYYY').toDate(),
|
||||
showUrl: content.find('.episode_left_column a').attr('href')
|
||||
};
|
||||
resolve(torrent);
|
||||
});
|
||||
}
|
||||
|
||||
function parseSize(sizeText) {
|
||||
if (!sizeText) {
|
||||
return undefined;
|
||||
}
|
||||
let scale = 1;
|
||||
if (sizeText.includes('GB')) {
|
||||
scale = 1024 * 1024 * 1024
|
||||
} else if (sizeText.includes('MB')) {
|
||||
scale = 1024 * 1024;
|
||||
} else if (sizeText.includes('KB') || sizeText.includes('kB')) {
|
||||
scale = 1024;
|
||||
}
|
||||
return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
|
||||
}
|
||||
|
||||
function jitter() {
|
||||
return Promises.delay(minDelay + Math.round(Math.random() * jitterDelay))
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse };
|
||||
85
scraper/scrapers/eztv/eztv_scraper.js
Normal file
85
scraper/scrapers/eztv/eztv_scraper.js
Normal file
@@ -0,0 +1,85 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { parse } = require('parse-torrent-title');
|
||||
const eztv = require('./eztv_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../../lib/repository');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
const { isEpisodeImdbId } = require('../../lib/metadata');
|
||||
|
||||
const NAME = 'EZTV';
|
||||
const UNTIL_PAGE = 10;
|
||||
|
||||
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent, getImdbIdsMethod) {
|
||||
// return getImdbIdsMethod()
|
||||
// .then(imdbIds => Promise.all(imdbIds.map(imdbId => limiter.schedule(() => eztv.search(imdbId)))))
|
||||
// .then(results => results.reduce((a, b) => a.concat(b), []))
|
||||
// .catch(() => limiter.schedule(() => eztv.torrent(torrent.torrentId)));
|
||||
return Promise.resolve([]);
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
return scrapeLatestTorrentsForCategory();
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(page = 1) {
|
||||
console.log(`Scrapping ${NAME} page ${page}`);
|
||||
return eztv.browse(({ page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] due: `, error);
|
||||
// return Promises.delay(30000).then(() => scrapeLatestTorrentsForCategory(page))
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (await checkAndUpdateTorrent(record)) {
|
||||
return record;
|
||||
}
|
||||
|
||||
if (!record || !record.size) {
|
||||
return Promise.resolve('Invalid torrent record');
|
||||
}
|
||||
|
||||
// imdb id for talk shows is usually incorrect on eztv
|
||||
const parsedTitle = parse(record.name);
|
||||
const dateEpisode = !parsedTitle.season && parsedTitle.date;
|
||||
if (dateEpisode && await isEpisodeImdbId(record.imdbId)) {
|
||||
delete record.imdbId;
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: record.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: record.torrentId,
|
||||
title: record.name.replace(/\t|\s+/g, ' ').trim(),
|
||||
type: Type.SERIES,
|
||||
size: record.size,
|
||||
seeders: record.seeders,
|
||||
uploadDate: record.uploadDate,
|
||||
imdbId: record.imdbId,
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent).then(() => torrent);
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
137
scraper/scrapers/horriblesubs/horriblesubs_api.js
Normal file
137
scraper/scrapers/horriblesubs/horriblesubs_api.js
Normal file
@@ -0,0 +1,137 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const Promises = require('../../lib/promises');
|
||||
|
||||
const defaultUrl = 'https://horriblesubs.info';
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
function allShows(config = {}) {
|
||||
return _getContent('/shows/', config)
|
||||
.then(($) => $('div[class="ind-show"]')
|
||||
.map((index, element) => $(element).children('a'))
|
||||
.map((index, element) => ({
|
||||
title: element.attr('title'),
|
||||
url: `${config.proxyUrl || defaultUrl}${element.attr('href')}`
|
||||
})).get());
|
||||
}
|
||||
|
||||
async function showData(showInfo, config = {}) {
|
||||
const showEndpoint = (showInfo.url || showInfo).match(/\/show.+/)[0];
|
||||
const title = showInfo.title;
|
||||
const showId = await _getShowId(showEndpoint);
|
||||
const packEntries = await _getShowEntries(showId, title, 'batch', config);
|
||||
const singleEntries = await _getShowEntries(showId, title, 'show', config);
|
||||
|
||||
return {
|
||||
title: title,
|
||||
url: showInfo.url || showInfo,
|
||||
showId: showId,
|
||||
singleEpisodes: singleEntries,
|
||||
packEpisodes: packEntries
|
||||
};
|
||||
}
|
||||
|
||||
async function getLatestEntries(config = {}) {
|
||||
return _getAllLatestEntries(config)
|
||||
.then((entries) => Promises.sequence(entries.map((entry) => () => _findLatestEntry(entry, config))))
|
||||
.then((entries) => entries.filter((entry) => entry))
|
||||
}
|
||||
|
||||
function _getContent(endpoint, config = {},) {
|
||||
const baseUrl = config.proxyUrl || defaultUrl;
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const url = endpoint.startsWith('http')
|
||||
? endpoint.replace(/https?:\/\/[^/]+/, baseUrl)
|
||||
: `${baseUrl}${endpoint}`;
|
||||
|
||||
return needle('get', url, { open_timeout: timeout })
|
||||
.then((response) => response.body)
|
||||
.then((body) => cheerio.load(body));
|
||||
}
|
||||
|
||||
function _getShowId(showEndpoint) {
|
||||
return _getContent(showEndpoint.replace(/(?:#\d+)?\/?$/, '/'))
|
||||
.then($ => $('div.entry-content').find('script').html().match(/var hs_showid = (\d+)/)[1]);
|
||||
}
|
||||
|
||||
function _getShowEntries(animeId, animeTitle, type, config) {
|
||||
return _getAllEntries(animeId, type, config)
|
||||
.then((entries) => entries.filter((entry) => entry.title === animeTitle));
|
||||
}
|
||||
|
||||
function _getAllEntries(animeId, type, config, page = 0, autoExtend = true) {
|
||||
const entriesEndpoint = `/api.php?method=getshows&type=${type}&showid=${animeId}&nextid=${page}`;
|
||||
return _getEntries(entriesEndpoint, config)
|
||||
.then((entries) => !autoExtend || !entries.length ? entries :
|
||||
_getAllEntries(animeId, type, config, page + 1)
|
||||
.then((nextEntries) => entries.concat(nextEntries)));
|
||||
}
|
||||
|
||||
function _getEntries(endpoint, config) {
|
||||
return _getContent(endpoint, config)
|
||||
.then(($) => $('div[class="rls-info-container"]')
|
||||
.map((index, element) => ({
|
||||
title: $(element).find('a[class="rls-label"]').contents()
|
||||
.filter((i, el) => el.nodeType === 3).first().text().trim(),
|
||||
episode: $(element).find('a[class="rls-label"]').find('strong').text(),
|
||||
uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
|
||||
mirrors: $(element).find('div[class="rls-links-container"]').children()
|
||||
.map((indexLink, elementLink) => ({
|
||||
resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
|
||||
magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
|
||||
torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
|
||||
})).get()
|
||||
})).get());
|
||||
}
|
||||
|
||||
function _getAllLatestEntries(config, page = 0) {
|
||||
const pageParam = page === 0 ? '' : `&nextid=${page}`;
|
||||
const entriesEndpoint = `/api.php?method=getlatest${pageParam}`;
|
||||
return _getContent(entriesEndpoint, config)
|
||||
.then(($) => $('li a')
|
||||
.map((index, element) => ({
|
||||
urlEndpoint: $(element).attr('href'),
|
||||
episode: $(element).find('strong').text()
|
||||
})).get())
|
||||
.then((entries) => entries.length < 12
|
||||
? entries
|
||||
: _getAllLatestEntries(config, page + 1)
|
||||
.then((nextEntries) => entries.concat(nextEntries)));
|
||||
}
|
||||
|
||||
async function _findLatestEntry(entry, config) {
|
||||
const showId = await _getShowId(entry.urlEndpoint);
|
||||
let foundEntry;
|
||||
let page = 0;
|
||||
let reachedEnd = false;
|
||||
|
||||
while (!foundEntry && !reachedEnd) {
|
||||
const allEntries = await _getAllEntries(showId, 'show', config, page, false);
|
||||
foundEntry = allEntries.filter((e) => e.episode === entry.episode)[0];
|
||||
page = page + 1;
|
||||
reachedEnd = allEntries.length === 0;
|
||||
}
|
||||
|
||||
if (!foundEntry) {
|
||||
return;
|
||||
}
|
||||
return {
|
||||
title: foundEntry.title,
|
||||
url: entry.urlEndpoint,
|
||||
showId: showId,
|
||||
singleEpisodes: [foundEntry]
|
||||
};
|
||||
}
|
||||
|
||||
function _parseDate(date) {
|
||||
if (date.match(/today/i)) {
|
||||
return moment().toDate();
|
||||
} else if (date.match(/yesterday/i)) {
|
||||
return moment().subtract(1, 'day').toDate();
|
||||
}
|
||||
return moment(date, 'MM/DD/YYYY').toDate();
|
||||
}
|
||||
|
||||
module.exports = { allShows, showData, getLatestEntries, _getShowId };
|
||||
|
||||
5636
scraper/scrapers/horriblesubs/horriblesubs_mapping.json
Normal file
5636
scraper/scrapers/horriblesubs/horriblesubs_mapping.json
Normal file
File diff suppressed because it is too large
Load Diff
186
scraper/scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
186
scraper/scrapers/horriblesubs/horriblesubs_scraper.js
Normal file
@@ -0,0 +1,186 @@
|
||||
const fs = require('fs');
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const decode = require('magnet-uri');
|
||||
const horriblesubs = require('./horriblesubs_api.js');
|
||||
const repository = require('../../lib/repository');
|
||||
const { Type } = require('../../lib/types');
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
const { getMetadata, getKitsuId } = require('../../lib/metadata');
|
||||
const showMappings = require('./horriblesubs_mapping.json');
|
||||
|
||||
const NAME = 'HorribleSubs';
|
||||
const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days;
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
const lastScraped = lastScrape.lastScraped && moment(lastScrape.lastScraped);
|
||||
|
||||
if (!lastScraped || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart) {
|
||||
console.log(`[${scrapeStart}] scrapping all ${NAME} shows...`);
|
||||
return _scrapeAllShows()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
|
||||
} else {
|
||||
console.log(`[${scrapeStart}] scrapping latest ${NAME} entries...`);
|
||||
return _scrapeLatestEntries()
|
||||
.then(() => console.log(`[${moment()}] finished scrapping latest ${NAME} entries`));
|
||||
}
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return Promise.resolve([]);
|
||||
}
|
||||
|
||||
async function _scrapeLatestEntries() {
|
||||
const latestEntries = await horriblesubs.getLatestEntries();
|
||||
|
||||
return Promise.all(latestEntries
|
||||
.map((entryData) => limiter.schedule(() => _parseShowData(entryData)
|
||||
.catch((err) => console.log(err)))));
|
||||
}
|
||||
|
||||
async function _scrapeAllShows() {
|
||||
const shows = await horriblesubs.allShows();
|
||||
|
||||
return Promise.all(shows
|
||||
.map((show) => limiter.schedule(() => horriblesubs.showData(show)
|
||||
.then((showData) => _parseShowData(showData, false))
|
||||
.catch((err) => console.log(err)))));
|
||||
}
|
||||
|
||||
async function compareSearchKitsuIds() {
|
||||
console.log(`${NAME}: initiating kitsu compare...`);
|
||||
const shows = await horriblesubs.allShows()
|
||||
.then((shows) => Promise.all(shows.slice(0, 1).map((show) => limiter.schedule(() => enrichShow(show)))));
|
||||
|
||||
const incorrect = shows.filter(
|
||||
(show) => showMappings[show.title] && showMappings[show.title].kitsu_id !== show.kitsu_id);
|
||||
const incorrectRatio = incorrect.length / shows.length;
|
||||
console.log(incorrect);
|
||||
console.log(`Ratio: ${incorrectRatio}`);
|
||||
}
|
||||
|
||||
async function initMapping() {
|
||||
console.log(`${NAME}: initiating kitsu mapping...`);
|
||||
const shows = await horriblesubs.allShows()
|
||||
.then((shows) => shows.filter((show) => !showMappings[show.title]))
|
||||
.then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
|
||||
.then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
|
||||
|
||||
fs.writeFile(
|
||||
"./scraper/scrapers/horriblesubs/horriblesubs_mapping.json",
|
||||
JSON.stringify(shows), 'utf8',
|
||||
(err) => {
|
||||
if (err) {
|
||||
console.log("An error occurred while writing JSON Object to File.", err);
|
||||
} else {
|
||||
console.log(`${NAME}: finished kitsu mapping`);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
async function enrichShow(show) {
|
||||
console.log(`${NAME}: getting show info for ${show.title}...`);
|
||||
const showId = await horriblesubs._getShowId(show.url)
|
||||
.catch(() => show.title);
|
||||
const metadata = await getKitsuId({ title: show.title })
|
||||
.then((kitsuId) => getMetadata(kitsuId))
|
||||
.catch((error) => {
|
||||
console.log(`Failed getting kitsu meta: ${error.message}`);
|
||||
return {};
|
||||
});
|
||||
|
||||
return {
|
||||
showId: showId,
|
||||
kitsu_id: metadata.kitsuId,
|
||||
...show,
|
||||
kitsuTitle: metadata.title,
|
||||
imdb_id: metadata.imdbId
|
||||
}
|
||||
}
|
||||
|
||||
async function _parseShowData(showData, updateSeeders = true) {
|
||||
console.log(`${NAME}: scrapping ${showData.title} data...`);
|
||||
const showMapping = showMappings[showData.title];
|
||||
const kitsuId = showMapping && showMapping.kitsu_id;
|
||||
if (!showMapping) {
|
||||
throw new Error(`No kitsu mapping found for ${showData.title}`);
|
||||
}
|
||||
if (!kitsuId) {
|
||||
throw new Error(`No kitsuId found for ${showData.title}`);
|
||||
}
|
||||
|
||||
// sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
|
||||
const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
|
||||
.then((metas) => metas.reduce((map, meta) => {
|
||||
const epOffset = Object.keys(map).length;
|
||||
[...Array(meta.totalCount || 1).keys()]
|
||||
.map(ep => ep + 1)
|
||||
.forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title });
|
||||
return map;
|
||||
}, {})) || {};
|
||||
const formatTitle = (episodeInfo, mirror) => {
|
||||
const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
|
||||
if (mapping) {
|
||||
return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
|
||||
}
|
||||
return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
|
||||
};
|
||||
const getKitsuId = inputEpisode => {
|
||||
const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode;
|
||||
const episode = parseInt(episodeString, 10);
|
||||
if (kitsuIdsMapping[episode]) {
|
||||
return kitsuIdsMapping[episode].kitsuId;
|
||||
} else if (Array.isArray(kitsuId)) {
|
||||
console.warn(`Unmapped episode number for ${showData.title} - ${inputEpisode}`);
|
||||
return undefined;
|
||||
}
|
||||
return kitsuId;
|
||||
};
|
||||
|
||||
return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
|
||||
.map(episodeInfo => episodeInfo.mirrors
|
||||
.filter(mirror => mirror.magnetLink && mirror.magnetLink.length)
|
||||
.map(mirror => ({
|
||||
provider: NAME,
|
||||
...mirror,
|
||||
infoHash: decode(mirror.magnetLink).infoHash,
|
||||
trackers: decode(mirror.magnetLink).tr.join(','),
|
||||
title: formatTitle(episodeInfo, mirror),
|
||||
type: Type.ANIME,
|
||||
kitsuId: getKitsuId(episodeInfo.episode),
|
||||
uploadDate: episodeInfo.uploadDate,
|
||||
})))
|
||||
.reduce((a, b) => a.concat(b), [])
|
||||
.filter(torrent => torrent.kitsuId)
|
||||
.map(torrent => entryLimiter.schedule(() => processTorrentRecord(torrent, updateSeeders))))
|
||||
.then(() => console.log(`${NAME}: finished scrapping ${showData.title} data`));
|
||||
}
|
||||
|
||||
async function processTorrentRecord(torrent, updateSeeders = true) {
|
||||
const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
|
||||
|
||||
if (existingTorrent && existingTorrent.provider === NAME) {
|
||||
if (updateSeeders) {
|
||||
return updateCurrentSeeders(torrent).then(updatedSeeders => checkAndUpdateTorrent(updatedSeeders))
|
||||
}
|
||||
return Promise.resolve(torrent)
|
||||
}
|
||||
|
||||
return updateTorrentSize(torrent)
|
||||
.then(updated => updateCurrentSeeders(updated))
|
||||
.then(updated => createTorrentEntry(updated, true))
|
||||
.catch(error => console.warn(`Failed creating entry for ${torrent.title}:`, error));
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
175
scraper/scrapers/kickass/kickass_api.js
Normal file
175
scraper/scrapers/kickass/kickass_api.js
Normal file
@@ -0,0 +1,175 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const decode = require('magnet-uri');
|
||||
const Promises = require('../../lib/promises');
|
||||
|
||||
const defaultProxies = [
|
||||
'https://katcr.co'
|
||||
];
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'movies',
|
||||
TV: 'tv',
|
||||
ANIME: 'anime',
|
||||
APPS: 'applications',
|
||||
GAMES: 'games',
|
||||
MUSIC: 'music',
|
||||
BOOKS: 'books',
|
||||
PORN: 'xxx',
|
||||
OTHER: 'other',
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} search`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config)))
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => ({ torrentId, ...torrent }))
|
||||
.catch((err) => torrent(torrentId, config, retries - 1));
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config)))
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/category/${category}/page/${page}`, config)))
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = { open_timeout: timeout, follow: 2 };
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (body.includes('Access Denied')) {
|
||||
console.log(`Access Denied: ${requestUrl}`);
|
||||
throw new Error(`Access Denied: ${requestUrl}`);
|
||||
} else if (body.includes('502: Bad gateway') ||
|
||||
body.includes('403 Forbidden') ||
|
||||
body.includes('Origin DNS error') ||
|
||||
!body.includes('Kickass Torrents</title>')) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$('.table > tbody > tr').each((i, element) => {
|
||||
const row = $(element);
|
||||
const magnetLink = row.find('a[title="Torrent magnet link"]').attr('href');
|
||||
torrents.push({
|
||||
name: row.find('a[class="torrents_table__torrent_title"]').first().children('b').text(),
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
magnetLink: magnetLink,
|
||||
torrentId: row.find('a[class="torrents_table__torrent_title"]').first().attr('href').replace('/torrent/', ''),
|
||||
category: row.find('span[class="torrents_table__upload_info"]').first().children('a').first().attr('href')
|
||||
.match(/category\/([^\/]+)/)[1],
|
||||
seeders: parseInt(row.find('td[data-title="Seed"]').first().text()),
|
||||
leechers: parseInt(row.find('td[data-title="Leech"]').first().text()),
|
||||
size: parseSize(row.find('td[data-title="Size"]').first().text()),
|
||||
uploadDate: moment(row.find('td[data-title="Age"]').first().attr('title')).toDate()
|
||||
});
|
||||
});
|
||||
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
const content = $('div[class="col"]').first();
|
||||
const info = content.find('div[class="torrent_stats"]').parent();
|
||||
const description = content.find('div[id="main"]');
|
||||
const magnetLink = info.find('a[title="Download verified Magnet"]').attr('href');
|
||||
const imdbIdMatch = description.html().match(/imdb\.com\/title\/(tt\d+)/i);
|
||||
|
||||
const torrent = {
|
||||
name: info.find('h1').first().text(),
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
magnetLink: magnetLink,
|
||||
seeders: parseInt(info.find('span[class="torrent_stats__seed_count mr-2"]').first().text().match(/\d+/)[0], 10),
|
||||
leechers: parseInt(info.find('span[class="torrent_stats__leech_count mr-2"]').first().text().match(/\d+/)[0], 10),
|
||||
category: info.find('small').first().children('a').first().attr('href').match(/\/category\/([^\/]+)/)[1],
|
||||
languages: description.find('span:contains(\'Audio\')').next().children().eq(0).text(),
|
||||
size: parseSize(description.find('ul[class="file_list"]').first().find('li').first().contents().eq(2).text()
|
||||
.match(/\(Size: (.+)\)/)[1]),
|
||||
uploadDate: moment(info.find('time').first().text()).toDate(),
|
||||
imdbId: imdbIdMatch && imdbIdMatch[1],
|
||||
files: content.find('ul[class="file_list"]').first().find('li > ul > li[class="file_list__file"]')
|
||||
.map((i, elem) => $(elem))
|
||||
.map((i, ele) => ({
|
||||
fileIndex: i,
|
||||
name: ele.find('span > ul > li').contents().eq(1).text().trim().replace(/^.+\//g, ''),
|
||||
path: ele.find('span > ul > li').contents().eq(1).text().trim(),
|
||||
size: parseSize(ele.contents().eq(2).text())
|
||||
})).get()
|
||||
};
|
||||
if (torrent.files.length >= 50) {
|
||||
// a max of 50 files are displayed on the page
|
||||
delete torrent.files;
|
||||
}
|
||||
resolve(torrent);
|
||||
});
|
||||
}
|
||||
|
||||
function parseSize(sizeText) {
|
||||
if (!sizeText) {
|
||||
return undefined;
|
||||
}
|
||||
let scale = 1;
|
||||
if (sizeText.includes('GB')) {
|
||||
scale = 1024 * 1024 * 1024
|
||||
} else if (sizeText.includes('MB')) {
|
||||
scale = 1024 * 1024;
|
||||
} else if (sizeText.includes('KB') || sizeText.includes('kB')) {
|
||||
scale = 1024;
|
||||
}
|
||||
return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
98
scraper/scrapers/kickass/kickass_dump_scraper.js
Normal file
98
scraper/scrapers/kickass/kickass_dump_scraper.js
Normal file
@@ -0,0 +1,98 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const LineByLineReader = require('line-by-line');
|
||||
const fs = require('fs');
|
||||
const { Type } = require('../../lib/types');
|
||||
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'KickassTorrents';
|
||||
const CSV_FILE_PATH = '/tmp/kickass.csv';
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 40 });
|
||||
|
||||
async function scrape() {
|
||||
console.log(`starting to scrape KAT dump: ${JSON.stringify(lastDump)}`);
|
||||
|
||||
let entriesProcessed = 0;
|
||||
const lr = new LineByLineReader(CSV_FILE_PATH);
|
||||
lr.on('line', (line) => {
|
||||
if (entriesProcessed % 1000 === 0) {
|
||||
console.log(`Processed ${entriesProcessed} entries`);
|
||||
}
|
||||
const row = line.match(/(?<=^|\|)(".*"|[^|]+)(?=\||$)/g);
|
||||
if (row.length !== 11) {
|
||||
console.log(`Invalid row: ${line}`);
|
||||
return;
|
||||
}
|
||||
const torrent = {
|
||||
infoHash: row[0].toLowerCase(),
|
||||
title: row[1]
|
||||
.replace(/^"|"$/g, '')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/&\w{2,6};/g, ' ')
|
||||
.replace(/\s+/g, ' ')
|
||||
.trim(),
|
||||
category: row[2],
|
||||
size: parseInt(row[5], 10),
|
||||
seeders: parseInt(row[8], 10),
|
||||
uploadDate: moment.unix(parseInt(row[10], 10)).toDate(),
|
||||
};
|
||||
|
||||
if (!limiter.empty()) {
|
||||
lr.pause()
|
||||
}
|
||||
|
||||
limiter.schedule(() => processTorrentRecord(torrent)
|
||||
.catch((error) => console.log(`failed ${torrent.title} due: ${error}`)))
|
||||
.then(() => limiter.empty())
|
||||
.then((empty) => empty && lr.resume())
|
||||
.then(() => entriesProcessed++);
|
||||
});
|
||||
lr.on('error', (err) => {
|
||||
console.log(err);
|
||||
});
|
||||
lr.on('end', () => {
|
||||
fs.unlink(CSV_FILE_PATH);
|
||||
console.log(`finished to scrape KAT dump: ${JSON.stringify(lastDump)}!`);
|
||||
});
|
||||
}
|
||||
|
||||
const categoryMapping = {
|
||||
"Movies": Type.MOVIE,
|
||||
"TV": Type.SERIES,
|
||||
"Anime": Type.ANIME
|
||||
};
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (!categoryMapping[record.category] || record.seeders === 0) {
|
||||
return createSkipTorrentEntry(record);
|
||||
}
|
||||
if (await getStoredTorrentEntry(record)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const torrentFound = await findTorrent(record).catch(() => undefined);
|
||||
|
||||
if (!torrentFound) {
|
||||
return createSkipTorrentEntry(record);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: record.infoHash,
|
||||
provider: NAME,
|
||||
title: torrentFound.name,
|
||||
size: record.size,
|
||||
type: categoryMapping[record.category],
|
||||
imdbId: torrentFound.imdbId,
|
||||
uploadDate: record.uploadDate,
|
||||
seeders: torrentFound.seeders,
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
async function findTorrent(record) {
|
||||
return Promise.reject("not found");
|
||||
}
|
||||
|
||||
module.exports = { scrape, NAME };
|
||||
91
scraper/scrapers/kickass/kickass_scraper.js
Normal file
91
scraper/scrapers/kickass/kickass_scraper.js
Normal file
@@ -0,0 +1,91 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const kickass = require('./kickass_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const repository = require('../../lib/repository');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'KickassTorrents';
|
||||
const UNTIL_PAGE = 10;
|
||||
const TYPE_MAPPING = typeMapping();
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => kickass.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
kickass.Categories.MOVIE,
|
||||
kickass.Categories.TV,
|
||||
kickass.Categories.ANIME,
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then(entries => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return kickass.browse(({ category, page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (await checkAndUpdateTorrent(record)) {
|
||||
return record;
|
||||
}
|
||||
|
||||
const torrentFound = await kickass.torrent(record.torrentId).catch(() => undefined);
|
||||
|
||||
if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) {
|
||||
return Promise.resolve('Invalid torrent record');
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: torrentFound.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: torrentFound.torrentId,
|
||||
title: torrentFound.name.replace(/\t|\s+/g, ' '),
|
||||
type: TYPE_MAPPING[torrentFound.category],
|
||||
size: torrentFound.size,
|
||||
seeders: torrentFound.seeders,
|
||||
uploadDate: torrentFound.uploadDate,
|
||||
imdbId: torrentFound.imdbId,
|
||||
languages: torrentFound.languages || undefined
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent).then(() => torrent);
|
||||
}
|
||||
|
||||
function typeMapping() {
|
||||
const mapping = {};
|
||||
mapping[kickass.Categories.MOVIE] = Type.MOVIE;
|
||||
mapping[kickass.Categories.TV] = Type.SERIES;
|
||||
mapping[kickass.Categories.ANIME] = Type.ANIME;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
141
scraper/scrapers/lapumia/lapumia_api.js
Normal file
141
scraper/scrapers/lapumia/lapumia_api.js
Normal file
@@ -0,0 +1,141 @@
|
||||
const needle = require("needle")
|
||||
const moment = require("moment")
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require('magnet-uri');
|
||||
const { escapeHTML } = require('../../lib/metadata');
|
||||
const { getRandomUserAgent } = require('../../lib/requestHelper');
|
||||
const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper')
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
const maxSearchPage = 50
|
||||
|
||||
const baseUrl = 'https://lapumia.org';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: null,
|
||||
TV: 'series',
|
||||
ANIME: 'animes',
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = torrentId.split('?p=')[1];
|
||||
return singleRequest(`${baseUrl}/?p=${slug}`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => torrent.map(el => ({ torrentId: slug, ...el })))
|
||||
.catch((err) => {
|
||||
console.warn(`Failed Lapumia ${slug} request: `, err);
|
||||
return torrent(torrentId, config, retries - 1)
|
||||
});
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1))
|
||||
|
||||
return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
|
||||
.then(body => parseTableBody(body))
|
||||
.then(torrents => torrents.length === 10 && page < extendToPage
|
||||
? search(keyword, { ...config, page: page + 1 }).catch(() => [])
|
||||
.then(nextTorrents => torrents.concat(nextTorrents))
|
||||
: torrents)
|
||||
.catch((err) => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(new Error(`Failed browse request`));
|
||||
}
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
const requestUrl = category ? `${baseUrl}/${category}/page/${page}/` : `${baseUrl}/page/${page}/`
|
||||
|
||||
return singleRequest(requestUrl, config)
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl, config = {}) {
|
||||
const timeout = config.timeout || defaultTimeout;
|
||||
const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl}`);
|
||||
} else if (body.includes('502: Bad gateway') ||
|
||||
body.includes('403 Forbidden')) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
|
||||
const torrents = [];
|
||||
|
||||
$('div.post').each((i, element) => {
|
||||
const row = $(element);
|
||||
torrents.push({
|
||||
name: row.find("div > a").text(),
|
||||
torrentId: row.find("div > a").attr("href")
|
||||
});
|
||||
});
|
||||
resolve(torrents);
|
||||
});
|
||||
}
|
||||
|
||||
function parseTorrentPage(body) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (!$) {
|
||||
reject(new Error('Failed loading body'));
|
||||
}
|
||||
const magnets = $('h2 > span')
|
||||
.filter((i, elem) => isPtDubbed($(elem).text())).parent()
|
||||
.map((i, elem) => $(elem).nextUntil('h2, hr'))
|
||||
.map((i, elem) => $(elem).find('a[href^="magnet"]'))
|
||||
.map((i, section) => $(section).attr("href")).get();
|
||||
const category = parseCategory($('div.category').html());
|
||||
const details = $('div.content')
|
||||
const torrents = magnets.map(magnetLink => ({
|
||||
title: sanitizePtName(escapeHTML(decode(magnetLink).name.replace(/\+/g, ' '))),
|
||||
originalName: sanitizePtOriginalName(details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue),
|
||||
year: details.find('b:contains(\'Ano de Lançamento:\')')[0].nextSibling.nodeValue.trim(),
|
||||
infoHash: decode(magnetLink).infoHash,
|
||||
magnetLink: magnetLink,
|
||||
category: category,
|
||||
uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', 'pt-br').format()),
|
||||
imdbId: $('.imdbRatingPlugin').attr('data-title') || null,
|
||||
languages: sanitizePtLanguages(details.find('b:contains(\'Idioma\')')[0].nextSibling.nodeValue)
|
||||
}))
|
||||
resolve(torrents.filter((x) => x));
|
||||
});
|
||||
}
|
||||
|
||||
function parseCategory(categorys) {
|
||||
const $ = cheerio.load(categorys)
|
||||
if ($('a:contains(\'Animes\')').text()) {
|
||||
return Categories.ANIME
|
||||
}
|
||||
if ($('a:contains(\'Series\')').text()) {
|
||||
return Categories.TV
|
||||
}
|
||||
return Categories.MOVIE
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
114
scraper/scrapers/lapumia/lapumia_scraper.js
Normal file
114
scraper/scrapers/lapumia/lapumia_scraper.js
Normal file
@@ -0,0 +1,114 @@
|
||||
const moment = require("moment");
|
||||
const Bottleneck = require("bottleneck");
|
||||
const lapumia = require("./lapumia_api");
|
||||
const { Type } = require("../../lib/types");
|
||||
const repository = require("../../lib/repository");
|
||||
const Promises = require("../../lib/promises");
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
|
||||
const { getImdbId } = require("../../lib/metadata");
|
||||
|
||||
const NAME = "Lapumia";
|
||||
const UNTIL_PAGE = 5;
|
||||
const TYPE_MAPPING = typeMapping();
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => lapumia.torrent(torrent.torrentId));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
lapumia.Categories.MOVIE
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories
|
||||
.map((category) => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then((entries) => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return lapumia
|
||||
.browse({ category, page })
|
||||
.catch((error) => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
|
||||
.then((resolved) => resolved.length > 0 && page < untilPage(category)
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processEntry(entry) {
|
||||
return lapumia.torrent(entry.torrentId)
|
||||
.then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
|
||||
.catch(() => undefined);
|
||||
}
|
||||
|
||||
async function processTorrentRecord(foundTorrent) {
|
||||
if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
|
||||
return foundTorrent;
|
||||
}
|
||||
|
||||
if (!foundTorrent.size) {
|
||||
await updateTorrentSize(foundTorrent);
|
||||
}
|
||||
if (!Number.isInteger(foundTorrent.seeders)) {
|
||||
await updateCurrentSeeders(foundTorrent);
|
||||
}
|
||||
if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) {
|
||||
const info = { title: foundTorrent.originalName, year: foundTorrent.year };
|
||||
foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: foundTorrent.infoHash,
|
||||
provider: NAME,
|
||||
torrentId: foundTorrent.torrentId,
|
||||
title: foundTorrent.title,
|
||||
type: TYPE_MAPPING[foundTorrent.category],
|
||||
imdbId: foundTorrent.imdbId,
|
||||
uploadDate: foundTorrent.uploadDate,
|
||||
seeders: foundTorrent.seeders,
|
||||
size: foundTorrent.size,
|
||||
files: foundTorrent.files,
|
||||
languages: foundTorrent.languages
|
||||
};
|
||||
return createTorrentEntry(torrent);
|
||||
}
|
||||
|
||||
function typeMapping() {
|
||||
const mapping = {};
|
||||
mapping[lapumia.Categories.MOVIE] = Type.MOVIE;
|
||||
mapping[lapumia.Categories.TV] = Type.SERIES;
|
||||
mapping[lapumia.Categories.ANIME] = Type.ANIME;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
function untilPage(category) {
|
||||
if (lapumia.Categories.TV === category) {
|
||||
return 5;
|
||||
}
|
||||
if (lapumia.Categories.ANIME === category) {
|
||||
return 2;
|
||||
}
|
||||
return UNTIL_PAGE;
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
73
scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js
Normal file
73
scraper/scrapers/nyaapantsu/nyaa_pantsu_api.js
Normal file
@@ -0,0 +1,73 @@
|
||||
const { pantsu } = require('nyaapi')
|
||||
|
||||
const Categories = {
|
||||
ANIME: {
|
||||
ALL: '3_',
|
||||
ENGLISH: '3_5',
|
||||
RAW: '3_6',
|
||||
MUSIC_VIDEO: '3_12',
|
||||
NON_ENGLISH: '3_13',
|
||||
},
|
||||
LIVE_ACTION: {
|
||||
ALL: '5_',
|
||||
ENGLISH: '5_9',
|
||||
RAW: '5_11',
|
||||
PROMOTIONAL_VIDEO: '5_10',
|
||||
NON_ENGLISH: '5_18',
|
||||
}
|
||||
}
|
||||
|
||||
function torrent(torrentId) {
|
||||
if (!torrentId) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} search`));
|
||||
}
|
||||
|
||||
return pantsu.infoRequest(torrentId)
|
||||
.then(result => parseTorrent(result))
|
||||
.catch(error => handleError(error, torrentId));
|
||||
}
|
||||
|
||||
function search(query) {
|
||||
return pantsu.search(query)
|
||||
.then(results => results.map(torrent => parseTorrent(torrent)))
|
||||
.catch(error => handleError(error, query));
|
||||
}
|
||||
|
||||
function browse(config = {}) {
|
||||
const page = config.page || 1;
|
||||
const category = config.category || Categories.ANIME.ENGLISH;
|
||||
|
||||
return pantsu.list(category, page)
|
||||
.then(results => results.map(torrent => parseTorrent(torrent)))
|
||||
.catch(error => handleError(error, category));
|
||||
}
|
||||
|
||||
function handleError(error, identifier) {
|
||||
if (error.statusCode && error.statusCode >= 400) {
|
||||
return Promise.reject(new Error(`${error.statusCode}: [${identifier}] failed retrieval on NyaaPantsu`));
|
||||
}
|
||||
return Promise.reject(error);
|
||||
}
|
||||
|
||||
function parseTorrent(torrent) {
|
||||
return {
|
||||
title: torrent.name.replace(/\t|\s+/g, ' ').trim(),
|
||||
torrentId: torrent.id,
|
||||
infoHash: torrent.hash.trim().toLowerCase(),
|
||||
magnetLink: torrent.magnet,
|
||||
torrentLink: torrent.torrent,
|
||||
seeders: torrent.seeders,
|
||||
size: torrent.filesize,
|
||||
uploadDate: new Date(torrent.date),
|
||||
category: `${torrent.category}_${torrent.sub_category}`,
|
||||
languages: torrent.languages ? torrent.languages.join(',') : undefined,
|
||||
files: torrent.file_list && torrent.file_list.length ? torrent.file_list.map((file, fileId) => ({
|
||||
fileIndex: fileId,
|
||||
name: file.path.replace(/([^\/]+$)/, '$1'),
|
||||
path: file.path,
|
||||
size: file.filesize
|
||||
})) : undefined
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
97
scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js
Normal file
97
scraper/scrapers/nyaapantsu/nyaa_pantsu_scraper.js
Normal file
@@ -0,0 +1,97 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const pantsu = require('./nyaa_pantsu_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const Promises = require('../../lib/promises');
|
||||
const repository = require('../../lib/repository');
|
||||
const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'NyaaPantsu';
|
||||
const UNTIL_PAGE = 5
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 5 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
// const ids = ['1033095'];
|
||||
// return Promise.all(ids.map(id => limiter.schedule(() => pantsu.torrent(id)
|
||||
// .then(torrent => processTorrentRecord(torrent)))))
|
||||
// .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
return limiter.schedule(() => pantsu.torrent(torrent.torrentId))
|
||||
.then(foundTorrent => {
|
||||
if (Number.isInteger(foundTorrent.seeders)) {
|
||||
return [foundTorrent];
|
||||
}
|
||||
return []
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
pantsu.Categories.ANIME.ENGLISH
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then(entries => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return pantsu.browse(({ page }))
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)
|
||||
.catch(error => {
|
||||
console.warn(`Failed processing [${torrent.infoHash}] ${torrent.title} due: `, error);
|
||||
return Promise.resolve();
|
||||
})))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (!record || await checkAndUpdateTorrent(record)) {
|
||||
return record;
|
||||
}
|
||||
|
||||
if (!record.size) {
|
||||
await updateTorrentSize(record)
|
||||
}
|
||||
if (record.seeders === null || record.seeders === undefined) {
|
||||
await updateCurrentSeeders(record);
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: record.infoHash,
|
||||
torrentLink: record.torrentLink,
|
||||
provider: NAME,
|
||||
torrentId: record.torrentId,
|
||||
title: record.title,
|
||||
type: Type.ANIME,
|
||||
size: record.size,
|
||||
seeders: record.seeders,
|
||||
uploadDate: record.uploadDate,
|
||||
languages: record.languages,
|
||||
files: record.files || undefined
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent).then(() => torrent);
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
80
scraper/scrapers/nyaasi/nyaa_si_api.js
Normal file
80
scraper/scrapers/nyaasi/nyaa_si_api.js
Normal file
@@ -0,0 +1,80 @@
|
||||
const { si } = require('nyaapi')
|
||||
|
||||
const Categories = {
|
||||
ANIME: {
|
||||
ALL: '1_0',
|
||||
MUSIC_VIDEO: '1_1',
|
||||
ENGLISH: '1_2',
|
||||
NON_ENGLISH: '1_3',
|
||||
RAW: '1_4'
|
||||
},
|
||||
LIVE_ACTION: {
|
||||
ALL: '4_0',
|
||||
ENGLISH: '4_1',
|
||||
PROMOTIONAL_VIDEO: '4_2',
|
||||
NON_ENGLISH: '4_3',
|
||||
RAW: '4_4'
|
||||
}
|
||||
}
|
||||
|
||||
function torrent(torrentId) {
|
||||
if (!torrentId) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} search`));
|
||||
}
|
||||
|
||||
return si.infoRequest(torrentId)
|
||||
.then(result => parseTorrent(result))
|
||||
.then(result => ({ ...result, torrentId }))
|
||||
.catch(error => {
|
||||
if (error.statusCode && error.statusCode === 404) {
|
||||
return Promise.reject(new Error(`404: [${torrentId}] not found on NyaaSi`));
|
||||
}
|
||||
return Promise.reject(error);
|
||||
});
|
||||
}
|
||||
|
||||
function search(query) {
|
||||
return si.search(query)
|
||||
.then(results => results.map(torrent => parseTorrent(torrent)));
|
||||
}
|
||||
|
||||
function browse(config = {}) {
|
||||
const page = config.page || 1;
|
||||
const category = config.category || Categories.ANIME.ENGLISH;
|
||||
const sort = config.sort || 'id'
|
||||
|
||||
return si.list(category, page, { sort })
|
||||
.then(response => response.results || [])
|
||||
.then(results => results.map(torrent => parseTorrent(torrent)));
|
||||
}
|
||||
|
||||
function parseTorrent(torrent) {
|
||||
return {
|
||||
title: torrent.name.replace(/\t|\s+/g, ' ').trim(),
|
||||
torrentId: torrent.id,
|
||||
infoHash: torrent.hash.trim().toLowerCase(),
|
||||
magnetLink: torrent.magnet,
|
||||
torrentLink: torrent.torrent,
|
||||
seeders: parseInt(torrent.seeders),
|
||||
size: parseSize(torrent.filesize),
|
||||
uploadDate: new Date(torrent.date),
|
||||
category: torrent.sub_category,
|
||||
}
|
||||
}
|
||||
|
||||
function parseSize(sizeText) {
|
||||
if (!sizeText) {
|
||||
return undefined;
|
||||
}
|
||||
let scale = 1;
|
||||
if (sizeText.includes('GiB')) {
|
||||
scale = 1024 * 1024 * 1024
|
||||
} else if (sizeText.includes('MiB')) {
|
||||
scale = 1024 * 1024;
|
||||
} else if (sizeText.includes('KiB') || sizeText.includes('kB')) {
|
||||
scale = 1024;
|
||||
}
|
||||
return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
|
||||
}
|
||||
|
||||
module.exports = { torrent, search, browse, Categories };
|
||||
83
scraper/scrapers/nyaasi/nyaa_si_scraper.js
Normal file
83
scraper/scrapers/nyaasi/nyaa_si_scraper.js
Normal file
@@ -0,0 +1,83 @@
|
||||
const moment = require('moment');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const nyaasi = require('./nyaa_si_api');
|
||||
const { Type } = require('../../lib/types');
|
||||
const Promises = require('../../lib/promises');
|
||||
const repository = require('../../lib/repository');
|
||||
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
|
||||
|
||||
const NAME = 'NyaaSi';
|
||||
const UNTIL_PAGE = 10
|
||||
|
||||
const limiter = new Bottleneck({ maxConcurrent: 10 });
|
||||
|
||||
async function scrape() {
|
||||
const scrapeStart = moment();
|
||||
const lastScrape = await repository.getProvider({ name: NAME });
|
||||
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
|
||||
|
||||
// const ids = ['1292786'];
|
||||
// return Promise.all(ids.map(id => limiter.schedule(() => nyaasi.torrent(id)
|
||||
// .then(torrent => processTorrentRecord(torrent)))))
|
||||
// .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
return scrapeLatestTorrents()
|
||||
.then(() => {
|
||||
lastScrape.lastScraped = scrapeStart;
|
||||
return lastScrape.save();
|
||||
})
|
||||
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
|
||||
}
|
||||
|
||||
async function updateSeeders(torrent) {
|
||||
// return limiter.schedule(() => nyaasi.torrent(torrent.torrentId))
|
||||
// .then(foundTorrent => Number.isInteger(foundTorrent.seeders) ? [foundTorrent] : []);
|
||||
return Promise.resolve([]);
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrents() {
|
||||
const allowedCategories = [
|
||||
nyaasi.Categories.ANIME.ENGLISH
|
||||
];
|
||||
|
||||
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
|
||||
.then(entries => entries.reduce((a, b) => a.concat(b), []));
|
||||
}
|
||||
|
||||
async function scrapeLatestTorrentsForCategory(category, page = 1) {
|
||||
console.log(`Scrapping ${NAME} ${category} category page ${page}`);
|
||||
return nyaasi.browse({ page, sort: 'id' })
|
||||
.catch(error => {
|
||||
console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
|
||||
return Promise.resolve([]);
|
||||
})
|
||||
.then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)
|
||||
.catch(error => {
|
||||
console.warn(`Failed processing [${torrent.infoHash}] ${torrent.title} due: `, error);
|
||||
return Promise.resolve();
|
||||
})))))
|
||||
.then(resolved => resolved.length > 0 && page < UNTIL_PAGE
|
||||
? scrapeLatestTorrentsForCategory(category, page + 1)
|
||||
: Promise.resolve());
|
||||
}
|
||||
|
||||
async function processTorrentRecord(record) {
|
||||
if (!record || await checkAndUpdateTorrent(record)) {
|
||||
return record;
|
||||
}
|
||||
|
||||
const torrent = {
|
||||
infoHash: record.infoHash,
|
||||
torrentLink: record.torrentLink,
|
||||
provider: NAME,
|
||||
torrentId: record.torrentId,
|
||||
title: record.title,
|
||||
type: Type.ANIME,
|
||||
size: record.size,
|
||||
seeders: record.seeders,
|
||||
uploadDate: record.uploadDate,
|
||||
};
|
||||
|
||||
return createTorrentEntry(torrent).then(() => torrent);
|
||||
}
|
||||
|
||||
module.exports = { scrape, updateSeeders, NAME };
|
||||
150
scraper/scrapers/ondebaixa/ondebaixa_api.js
Normal file
150
scraper/scrapers/ondebaixa/ondebaixa_api.js
Normal file
@@ -0,0 +1,150 @@
|
||||
const needle = require("needle")
|
||||
const cheerio = require("cheerio");
|
||||
const decode = require('magnet-uri');
|
||||
const { escapeHTML } = require('../../lib/metadata');
|
||||
const { getRandomUserAgent } = require('../../lib/requestHelper');
|
||||
const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper')
|
||||
|
||||
const defaultTimeout = 10000;
|
||||
const maxSearchPage = 50
|
||||
|
||||
const baseUrl = 'https://ondebaixa.com';
|
||||
|
||||
const Categories = {
|
||||
MOVIE: 'filmes',
|
||||
TV: 'series',
|
||||
ANIME: 'anime',
|
||||
DESENHOS: 'desenhos'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} query`));
|
||||
}
|
||||
const slug = encodeURIComponent(torrentId.split("/")[3]);
|
||||
return singleRequest(`${baseUrl}/${slug}/`, config)
|
||||
.then((body) => parseTorrentPage(body))
|
||||
      .then((torrent) => torrent.map(el => ({ torrentId: slug, ...el })))
      .catch((err) => {
        console.warn(`Failed OndeBaixa ${slug} request: `, err);
        return torrent(slug, config, retries - 1);
      });
}

function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1));

  return singleRequest(`${baseUrl}/${keyword}/${page}/`, config)
      .then(body => parseTableBody(body))
      .then(torrents => torrents.length === 40 && page < extendToPage
          ? search(keyword, { ...config, page: page + 1 }).catch(() => [])
              .then(nextTorrents => torrents.concat(nextTorrents))
          : torrents)
      .catch(() => search(keyword, config, retries - 1));
}

function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const category = config.category;
  const requestUrl = category ? `${baseUrl}/${category}/${page}/` : `${baseUrl}/${page}/`;

  return singleRequest(requestUrl, config)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}

function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };

  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl}`);
        } else if (body.includes('502: Bad gateway') ||
            body.includes('403 Forbidden')) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}

function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }

    const torrents = [];

    $('div.capa_larga.align-middle').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("a").text(),
        torrentId: row.find("a").attr("href")
      });
    });
    resolve(torrents);
  });
}

function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }
    const magnets = $(`a[href^="magnet"]`)
        .filter((i, elem) => isPtDubbed($(elem).attr('title')))
        .map((i, elem) => $(elem).attr("href")).get();
    const details = $('div#informacoes');
    const category = details.find('span:contains(\'Gêneros: \')').next().html();
    const torrents = magnets.map(magnetLink => {
      const decodedMagnet = decode(magnetLink);
      const name = escapeHTML(decodedMagnet.name || '').replace(/\+/g, ' ');
      const originalTitle = details.find('span:contains(\'Título Original: \')').next().text().trim();
      const year = details.find('span:contains(\'Ano de Lançamento: \')').next().text().trim();
      const fallbackTitle = `${originalTitle} ${year}`;
      return {
        title: name ? sanitizePtName(name) : fallbackTitle,
        originalName: sanitizePtOriginalName(originalTitle),
        year: year,
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: parseCategory(category),
        uploadDate: new Date($('time').attr('datetime')),
        languages: sanitizePtLanguages(details.find('span:contains(\'Idioma\')').next().text())
      };
    });
    resolve(torrents.filter((x) => x));
  });
}

function parseCategory(body) {
  const $ = cheerio.load(body);
  if ($("a[href*='anime']").text()) {
    return Categories.ANIME;
  }
  if ($("a[href*='series']").text()) {
    return Categories.TV;
  }
  if ($("a[href*='filmes']").text()) {
    return Categories.MOVIE;
  }
  if ($("a[href*='desenhos']").text()) {
    return Categories.TV;
  }
}

module.exports = { torrent, search, browse, Categories };
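A minimal usage sketch for this module (illustrative only, not part of the commit; assumes it sits next to ondebaixa_api.js, that browse() returns at least one entry, and that the entry's torrentId can be passed straight back to torrent()):

// sketch.js — hedged example, not commit code
const ondebaixa = require('./ondebaixa_api');

// List the first page of movie entries, then resolve the magnets of the first hit.
ondebaixa.browse({ category: ondebaixa.Categories.MOVIE, page: 1 })
    .then(entries => ondebaixa.torrent(entries[0].torrentId))
    .then(torrents => torrents.forEach(t => console.log(t.infoHash, t.title)))
    .catch(err => console.error('OndeBaixa request failed: ', err));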
119
scraper/scrapers/ondebaixa/ondebaixa_scraper.js
Normal file
@@ -0,0 +1,119 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const ondebaixa = require("./ondebaixa_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getImdbId } = require("../../lib/metadata");

const NAME = "OndeBaixa";
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping();

const limiter = new Bottleneck({ maxConcurrent: 5 });

async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent) {
  return limiter.schedule(() => ondebaixa.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
  const allowedCategories = [
    ondebaixa.Categories.MOVIE,
    ondebaixa.Categories.TV,
    ondebaixa.Categories.DESENHOS
  ];

  return Promises.sequence(
      allowedCategories.map(
          (category) => () => scrapeLatestTorrentsForCategory(category)
      )
  ).then((entries) => entries.reduce((a, b) => a.concat(b), []));
}

async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scraping ${NAME} ${category} category page ${page}`);
  return ondebaixa
      .browse({ category, page })
      .catch((error) => {
        console.warn(`Failed ${NAME} scraping for [${page}] ${category} due to: `, error);
        return Promise.resolve([]);
      })
      .then((torrents) => Promise.all(torrents.map((torrent) => limiter.schedule(() => processEntry(torrent)))))
      .then((resolved) => resolved.length > 0 && page < untilPage(category)
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}

async function processEntry(entry) {
  return ondebaixa.torrent(entry.torrentId)
      .then(records => Promises.sequence(records.map(record => () => processTorrentRecord(record))))
      .catch(() => undefined);
}

async function processTorrentRecord(foundTorrent) {
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }

  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
  }

  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    imdbId: foundTorrent.imdbId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}

function typeMapping() {
  const mapping = {};
  mapping[ondebaixa.Categories.MOVIE] = Type.MOVIE;
  mapping[ondebaixa.Categories.TV] = Type.SERIES;
  mapping[ondebaixa.Categories.DESENHOS] = Type.SERIES;
  mapping[ondebaixa.Categories.ANIME] = Type.ANIME;
  return mapping;
}

function untilPage(category) {
  if (ondebaixa.Categories.DESENHOS === category) {
    return 5;
  }
  if (ondebaixa.Categories.TV === category) {
    return 5;
  }
  return UNTIL_PAGE;
}

module.exports = { scrape, updateSeeders, NAME };
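The scraper leans on two concurrency tools: Bottleneck caps parallel detail lookups at 5, while Promises.sequence (a project helper not shown in this commit) runs the category scrapes strictly one after another. A hypothetical stand-in illustrating the assumed contract:

// Hypothetical stand-in for ../../lib/promises' sequence helper (assumption,
// not the actual implementation): run promise-returning thunks in order
// and resolve with the collected results.
function sequence(thunks) {
  return thunks.reduce(
      (acc, thunk) => acc.then(results => thunk().then(result => results.concat([result]))),
      Promise.resolve([]));
}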
147
scraper/scrapers/rarbg/rarbg_api.js
Normal file
@@ -0,0 +1,147 @@
const needle = require('needle');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');

const baseUrl = 'https://torrentapi.org/pubapi_v2.php';
const appId = 'torrentio-addon';
const defaultTimeout = 30000;

let token;

const Options = {
  category: {
    MOVIES_XVID: [14],
    MOVIES_XVID_720P: [48],
    MOVIES_X264: [17],
    MOVIES_X264_1080P: [44],
    MOVIES_X264_720P: [45],
    MOVIES_X264_3D: [47],
    MOVIES_X264_4K: [50],
    MOVIES_X265_1080P: [54],
    MOVIES_X265_4K: [51],
    MOVIES_X265_4K_HDR: [52],
    MOVIES_FULL_BD: [42],
    MOVIES_BD_REMUX: [46],
    TV_EPISODES: [18],
    TV_UHD_EPISODES: [49],
    TV_HD_EPISODES: [41],
    MUSIC_MP3: [23],
    MUSIC_FLAC: [25],
    GAMES_PC_ISO: [27],
    GAMES_PC_RIP: [28],
    GAMES_PS3: [40],
    GAMES_XBOX_360: [32],
    SOFTWARE_PC_ISO: [33],
    EBOOKS: [35],
    XXX: [4],
  },
  sort: {
    LAST: 'last',
    SEEDERS: 'seeders',
    LEECHERS: 'leechers'
  },
  format: {
    JSON: 'json',
    JSON_EXTENDED: 'json_extended'
  },
  ranked: {
    TRUE: 1,
    FALSE: 0
  }
};

function search(imdbId, params = {}) {
  if (!imdbId) {
    return Promise.reject(new Error(`Must define imdbId`));
  }
  const parameters = {
    mode: 'search',
    search_imdb: imdbId,
    category: params.category && params.category.join(';') || null,
    limit: params.limit || 100,
    sort: params.sort || Options.sort.SEEDERS,
    min_seeders: params.min_seeders || undefined,
    min_leechers: params.min_leechers || undefined,
    format: params.format || Options.format.JSON_EXTENDED,
    ranked: params.ranked || Options.ranked.FALSE
  };

  return singleRequest(parameters).then(results => parseResults(results));
}

function browse(params = {}) {
  const parameters = {
    mode: 'list',
    category: params.category && params.category.join(';') || null,
    limit: params.limit || 100,
    sort: params.sort || Options.sort.LAST,
    min_seeders: params.min_seeders || undefined,
    min_leechers: params.min_leechers || undefined,
    format: params.format || Options.format.JSON_EXTENDED,
    ranked: params.ranked || Options.ranked.FALSE
  };

  return singleRequest(parameters).then(results => parseResults(results));
}

async function singleRequest(params = {}, config = {}, retries = 10) {
  const timeout = config.timeout || defaultTimeout;
  const options = { open_timeout: timeout, follow: 2 };
  params.token = await getToken();
  params.app_id = appId;

  Object.keys(params)
      .filter(key => params[key] === undefined || params[key] === null)
      .forEach(key => delete params[key]);

  return needle('get', baseUrl, params, options)
      .then(response => {
        if (response.body && response.body.error_code === 4) {
          // token expired
          token = undefined;
          return singleRequest(params, config);
        }
        if ((!response.body || !response.body.length || [5, 20].includes(response.body.error_code)) && retries > 0) {
          // too many requests
          return Promises.delay(3000).then(() => singleRequest(params, config, retries - 1));
        }
        if (response.statusCode !== 200 || (response.body && response.body.error)) {
          // something went wrong
          return Promise.reject(response.body || `Failed RARBG request with status=${response.statusCode}`);
        }

        return response.body;
      });
}

function parseResults(results) {
  if (!results || !Array.isArray(results.torrent_results)) {
    return Promise.reject(`Incorrect results ${JSON.stringify(results)}`);
  }
  return results.torrent_results.map(result => parseResult(result));
}

function parseResult(result) {
  return {
    title: result.title,
    infoHash: decode(result.download).infoHash,
    magnetLink: result.download,
    seeders: result.seeders,
    leechers: result.leechers,
    category: result.category,
    size: result.size,
    uploadDate: new Date(result.pubdate),
    imdbId: result.episode_info && result.episode_info.imdb
  };
}

async function getToken() {
  if (!token) {
    const options = { open_timeout: defaultTimeout };
    token = await needle('get', baseUrl, { get_token: 'get_token', app_id: appId }, options)
        .then(response => response.body.token);
  }
  return token;
}

module.exports = { search, browse, Options };
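How the two public calls compose (illustrative sketch, not part of the commit; torrentapi.org enforces its own rate limits on top of the retry logic above, and category options are arrays because the API accepts semicolon-joined lists):

const rarbg = require('./rarbg_api');

// IMDb-scoped search, restricted to 1080p x264 movie releases.
rarbg.search('tt0816692', { category: rarbg.Options.category.MOVIES_X264_1080P })
    .then(torrents => console.log(`found ${torrents.length} torrents`))
    .catch(err => console.error(err));

// Latest additions in TV episode categories, sorted by insertion time.
rarbg.browse({ category: rarbg.Options.category.TV_EPISODES, sort: rarbg.Options.sort.LAST })
    .then(torrents => torrents.forEach(t => console.log(t.title, t.seeders)));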
84
scraper/scrapers/rarbg/rarbg_dump_scraper.js
Normal file
@@ -0,0 +1,84 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const rarbg = require('./rarbg_api');
const { Type } = require('../../lib/types');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'RARBG';

const limiter = new Bottleneck({ maxConcurrent: 1, minTime: 3000 });
const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
const allowedCategories = [
  rarbg.Options.category.MOVIES_XVID,
  rarbg.Options.category.MOVIES_XVID_720P,
  rarbg.Options.category.MOVIES_X265_1080P,
  rarbg.Options.category.MOVIES_X265_4K,
  rarbg.Options.category.MOVIES_X265_4K_HDR,
  rarbg.Options.category.MOVIES_X264,
  rarbg.Options.category.MOVIES_X264_720P,
  rarbg.Options.category.MOVIES_X264_1080P,
  rarbg.Options.category.MOVIES_X264_3D,
  rarbg.Options.category.MOVIES_X264_4K,
  rarbg.Options.category.MOVIES_BD_REMUX,
  rarbg.Options.category.TV_EPISODES,
  rarbg.Options.category.TV_UHD_EPISODES,
  rarbg.Options.category.TV_HD_EPISODES
].reduce((a, b) => a.concat(b), []);

async function scrape() {
  console.log(`[${moment()}] starting ${NAME} dump scrape...`);
  // const movieImdbIds = require('./rargb_movie_imdb_ids_2021-02-27.json');
  const seriesImdbIds = require('./rargb_series_imdb_ids_2021-02-27.json');
  // const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);

  return Promise.all(
      seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
          .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
      .then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
}

async function getTorrentsForImdbId(imdbId) {
  return rarbg.search(imdbId, { category: allowedCategories })
      .then(torrents => {
        console.log(`Completed ${imdbId} request`);
        return torrents;
      })
      .catch(error => {
        console.warn(`Failed ${NAME} request for ${imdbId}: `, error);
        return [];
      });
}

async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }

  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    title: record.title,
    type: getType(record.category),
    seeders: record.seeders,
    size: record.size,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId
  };

  return createTorrentEntry(torrent);
}

const seriesCategories = [
  'TV Episodes',
  'Movies/TV-UHD-episodes',
  'TV HD Episodes',
];

function getType(category) {
  if (seriesCategories.includes(category)) {
    return Type.SERIES;
  }
  return Type.MOVIE;
}

module.exports = { scrape, NAME };
99
scraper/scrapers/rarbg/rarbg_scraper.js
Normal file
@@ -0,0 +1,99 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const rarbg = require('./rarbg_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'RARBG';

const limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });

async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent, getImdbIdsMethod) {
  // return getImdbIdsMethod()
  //     .then(imdbIds => Promise.all(imdbIds.map(imdbId => limiter.schedule(() => search(imdbId)))))
  //     .then(results => results.reduce((a, b) => a.concat(b), []));
  return Promise.resolve([]);
}

async function scrapeLatestTorrents() {
  const allowedCategories = [
    rarbg.Options.category.MOVIES_XVID,
    rarbg.Options.category.MOVIES_XVID_720P,
    rarbg.Options.category.MOVIES_X265_1080P,
    rarbg.Options.category.MOVIES_X265_4K,
    rarbg.Options.category.MOVIES_X265_4K_HDR,
    rarbg.Options.category.MOVIES_X264,
    rarbg.Options.category.MOVIES_X264_720P,
    rarbg.Options.category.MOVIES_X264_1080P,
    rarbg.Options.category.MOVIES_X264_3D,
    rarbg.Options.category.MOVIES_X264_4K,
    rarbg.Options.category.MOVIES_BD_REMUX,
    rarbg.Options.category.TV_EPISODES,
    rarbg.Options.category.TV_UHD_EPISODES,
    rarbg.Options.category.TV_HD_EPISODES
  ];

  return Promises.sequence(allowedCategories
      .map(category => () => limiter.schedule(() => scrapeLatestTorrentsForCategory(category))))
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}

async function scrapeLatestTorrentsForCategory(category) {
  console.log(`Scraping ${NAME} ${category} category`);
  return rarbg.browse({ category: category })
      .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))
      .catch(error => {
        console.warn(`Failed ${NAME} scraping for ${category} due to: `, error);
        return Promise.resolve([]);
      });
}

async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }

  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    title: record.title,
    type: getType(record.category),
    seeders: record.seeders,
    size: record.size,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId
  };

  return createTorrentEntry(torrent);
}

const seriesCategories = [
  'TV Episodes',
  'Movies/TV-UHD-episodes',
  'TV HD Episodes',
];

function getType(category) {
  if (seriesCategories.includes(category)) {
    return Type.SERIES;
  }
  return Type.MOVIE;
}

module.exports = { scrape, updateSeeders, NAME };
42378
scraper/scrapers/rarbg/rargb_movie_imdb_ids_2020-04-22.json
Normal file
File diff suppressed because it is too large
Load Diff
6305
scraper/scrapers/rarbg/rargb_series_imdb_ids_2020-04-22.json
Normal file
File diff suppressed because it is too large
Load Diff
206
scraper/scrapers/rutor/rutor_api.js
Normal file
@@ -0,0 +1,206 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const { defaultOptionsWithProxy } = require('../../lib/requestHelper');

const baseUrl = 'http://www.rutor.info';
const defaultTimeout = 10000;

const Categories = {
  FOREIGN_FILMS: '1',
  RUSSIAN_FILMS: '5',
  SCIENCE_FILMS: '12',
  FOREIGN_SERIES: '4',
  RUSSIAN_SERIES: '16',
  RUSSIAN_TV: '6',
  RUSSIAN_ANIMATION: '7',
  ANIME: '10',
  FOREIGN_RELEASES: '17'
};

function torrent(torrentId, config = {}, retries = 2, error = null) {
  if (!torrentId || retries === 0) {
    return Promise.reject(error || new Error(`Failed ${torrentId} search`));
  }

  return singleRequest(`${baseUrl}/torrent/${torrentId}`)
      .then((body) => parseTorrentPage(body, torrentId))
      .catch((err) => torrent(torrentId, config, retries - 1, err));
}

function search(query, retries = 2, error = null) {
  if (retries === 0) {
    return Promise.reject(error || new Error(`Failed search request`));
  }

  return singleRequest(`${baseUrl}/search/0/0/0/0/${encodeURIComponent(query)}`)
      .then((body) => parseTableBody(body))
      .catch((err) => search(query, retries - 1, err));
}

function browse(config = {}, retries = 2, error = null) {
  if (retries === 0) {
    return Promise.reject(error || new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const category = config.category;

  return singleRequest(`${baseUrl}/browse/${page - 1}/${category}/0/0`)
      .then((body) => parseTableBody(body))
      .catch((err) => browse(config, retries - 1, err));
}

function files(torrentId) {
  return singleRequest(`${baseUrl}/descriptions/${torrentId}.files`)
      .then((body) => parseFiles(body));
}

function singleRequest(requestUrl) {
  const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };

  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
        } else if (body.includes('Access Denied')) {
          console.log(`Access Denied: ${requestUrl}`);
          throw new Error(`Access Denied: ${requestUrl}`);
        } else if (body.includes('502: Bad gateway') ||
            body.includes('403 Forbidden') ||
            body.includes('Origin DNS error')) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}

function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }

    const torrents = $('#index').find('tr:not(.backgr)').map((i, elem) => {
      const row = $(elem).find('td');
      const links = $(row[1]).find('a');
      const peers = $(row[row.length - 1]);
      const magnetLink = $(links[1]).attr('href');

      return {
        title: $(links[2]).text(),
        infoHash: decode(magnetLink).infoHash,
        magnetLink: magnetLink,
        torrentLink: $(links[0]).attr('href'),
        torrentId: $(links[2]).attr('href').match(/torrent\/(\d+)/)[1],
        seeders: parseInt(peers.find('.green').text()),
        leechers: parseInt(peers.find('.red').text()),
        uploadDate: parseRussianDate($(row[0]).text()),
        size: $(row[row.length - 2]).html().replace('&nbsp;', ' '),
      };
    }).get();

    resolve(torrents);
  });
}

function parseTorrentPage(body, torrentId) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }
    const rows = $('#details > tr');
    const details = $(rows[0]).find('td:nth-of-type(2)');
    const magnetLink = $('#download a:nth-of-type(1)').attr('href');
    const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);

    const parsedTorrent = {
      title: $('#all h1').first().text(),
      torrentId: torrentId,
      infoHash: decode(magnetLink).infoHash,
      trackers: Array.from(new Set(decode(magnetLink).tr)).join(','),
      magnetLink: magnetLink,
      torrentLink: $('#download a:nth-of-type(2)').attr('href'),
      seeders: parseInt($(rows[rows.length - 8]).find('td:nth-of-type(2)').first().text(), 10),
      category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
      languages: parseLanguages(details.text()),
      size: parseSize($(rows[rows.length - 4]).find('td:nth-of-type(2)').text()),
      uploadDate: parseDate($(rows[rows.length - 5]).find('td:nth-of-type(2)').first().text()),
      imdbId: imdbIdMatch && imdbIdMatch[1]
    };
    resolve(parsedTorrent);
  });
}

function parseFiles(body) {
  if (!body) {
    throw new Error("No files in the body");
  }
  return body.split('\n')
      .map((item) => item.match(/<td>([^<]+)<\/td>/g).slice(1))
      .map((item, index) => ({
        fileIndex: index,
        name: item[0].replace(/^.+\//g, ''),
        path: item[0].replace(/^.+\//, ''),
        size: parseSize(item[1])
      }));
}

function parseDate(dateString) {
  const preparedDate = dateString.replace(/\s\(.*\)/, '');
  return moment(preparedDate, 'DD-MM-YYYY HH:mm:ss').toDate();
}

const russianMonths = {
  'Янв': 'Jan',
  'Фев': 'Feb',
  'Мар': 'Mar',
  'Апр': 'Apr',
  'Май': 'May',
  'Июн': 'Jun',
  'Июл': 'Jul',
  'Авг': 'Aug',
  'Сен': 'Sep',
  'Окт': 'Oct',
  'Ноя': 'Nov',
  'Дек': 'Dec'
};

function parseRussianDate(dateString) {
  const rusMonth = Object.keys(russianMonths).find(month => dateString.includes(month));
  const preparedDate = dateString.trim().replace(rusMonth, russianMonths[rusMonth]).replace(/\u00a0/g, ' ');
  return moment(preparedDate, 'DD MMM YY').toDate();
}

function parseSize(sizeString) {
  return parseInt(sizeString.match(/\((\d+) Bytes\)/)[1], 10);
}

const languageMatchers = {
  'russian': /(?:Язык|Звук|Аудио|audio|language).*(russian|\brus?\b|[Рр]усский)/i,
  'english': /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i,
  'ukrainian': /(?:Язык|Звук|Аудио|audio|language).*(ukrainian|\bukr\b|украинский)/i,
  'french': /(?:Язык|Звук|Аудио|audio|language).*(french|\bfr\b|французский)/i,
  'spanish': /(?:Язык|Звук|Аудио|audio|language).*(spanish|\bspa\b|испанский)/i,
  'italian': /(?:Язык|Звук|Аудио|audio|language).*(italian|\bita\b|итальянский)/i,
  'german': /(?:Язык|Звук|Аудио|audio|language).*(german|\bger\b|Немецкий)/i,
  'korean': /(?:Язык|Звук|Аудио|audio|language).*(korean|Корейский)/i,
  'arabic': /(?:Язык|Звук|Аудио|audio|language).*(arabic|Арабский)/i,
  'portuguese': /(?:Язык|Звук|Аудио|audio|language).*(portuguese|Португальский)/i,
  'japanese': /(?:Язык|Звук|Аудио|audio|language).*(japanese|\bjap\b|\bjp\b|[Яя]понский)/i,
};

function parseLanguages(details) {
  const subsInfoMatch = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
  const detailsPart = subsInfoMatch ? details.substring(0, subsInfoMatch.index) : details;
  const matchedLanguages = Object.keys(languageMatchers).filter(lang => languageMatchers[lang].test(detailsPart));
  const languages = Array.from(new Set(['russian'].concat(matchedLanguages)));
  return languages.length > 4 ? 'multi-audio' : languages.join(',');
}

module.exports = { torrent, browse, search, Categories };
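Each languageMatchers entry anchors on a Russian or English audio/language label before the language token itself, so casual mentions elsewhere in a description do not count. For instance:

// Illustrative check with the module's own English matcher:
const english = /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i;
console.log(english.test('Аудио: русский, английский')); // true — labeled audio line
console.log(english.test('Subtitles: english'));          // false — no audio/language label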
133
scraper/scrapers/rutor/rutor_scraper.js
Normal file
@@ -0,0 +1,133 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const rutor = require('./rutor_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'Rutor';
const TYPE_MAPPING = {
  'kino': Type.MOVIE,
  'nashe_kino': Type.MOVIE,
  'nauchno_popularnoe': Type.MOVIE,
  'inostrannoe': Type.MOVIE,
  'seriali': Type.SERIES,
  'nashi_seriali': Type.SERIES,
  'tv': Type.SERIES,
  'multiki': Type.MOVIE,
  'anime': Type.ANIME
};

const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
const limiter = new Bottleneck({ maxConcurrent: 10 });
const allowedCategories = [
  rutor.Categories.FOREIGN_FILMS,
  rutor.Categories.FOREIGN_RELEASES,
  rutor.Categories.RUSSIAN_FILMS,
  rutor.Categories.FOREIGN_SERIES,
  rutor.Categories.RUSSIAN_SERIES,
  rutor.Categories.SCIENCE_FILMS,
  rutor.Categories.RUSSIAN_ANIMATION,
  rutor.Categories.ANIME
];

async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  // const ids = [
  //   '637799'
  // ];
  // return Promise.all(ids.map(id => api_entry_limiter.schedule(() => rutor.torrent(id))
  //     .then(torrent => processTorrentRecord(torrent))))
  //     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent) {
  return Promise.resolve([]);
}

async function scrapeLatestTorrents() {
  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}

async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scraping ${NAME} ${category} category page ${page}`);
  return api_limiter.schedule(() => rutor.browse({ category, page }))
      .catch(error => {
        console.warn(`Failed ${NAME} scraping for [${page}] ${category} due to: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}

async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
  if (record.seeders === 0 && isOld) {
    console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`);
    return record;
  }

  const foundTorrent = await api_entry_limiter.schedule(() => rutor.torrent(record.torrentId).catch(() => undefined));

  if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
    return Promise.resolve(`${NAME}: Invalid torrent record: ${record.torrentId}`);
  }
  if (!foundTorrent.imdbId && disallowWithoutImdbId(foundTorrent)) {
    return Promise.resolve(`${NAME}: No imdbId defined: ${record.torrentId}`);
  }

  const torrent = {
    provider: NAME,
    infoHash: foundTorrent.infoHash,
    torrentId: foundTorrent.torrentId,
    torrentLink: foundTorrent.torrentLink,
    trackers: foundTorrent.trackers,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    size: foundTorrent.size,
    seeders: foundTorrent.seeders,
    uploadDate: foundTorrent.uploadDate,
    imdbId: foundTorrent.imdbId,
    languages: foundTorrent.languages || undefined,
  };

  return createTorrentEntry(torrent).then(() => torrent);
}

function getMaxPage(category) {
  switch (category) {
    case rutor.Categories.FOREIGN_FILMS:
    case rutor.Categories.FOREIGN_SERIES:
      return 2;
    default:
      return 1;
  }
}

function disallowWithoutImdbId(torrent) {
  if (['kino', 'anime'].includes(torrent.category)) {
    return false; // allow searching foreign movie and anime ids via search
  }
  // allow searching ids for non-Russian series titles via search
  return !(torrent.category === 'seriali' && !parse(torrent.title).title.match(/[\u0400-\u04ff]/i));
}

module.exports = { scrape, updateSeeders, NAME };
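The disallowWithoutImdbId gate reduces to one Unicode range test: a series entry with no IMDb id survives only if its parsed title contains no Cyrillic (U+0400–U+04FF), since only Latin titles can be resolved through the IMDb search fallback. A small illustration (hypothetical titles):

const { parse } = require('parse-torrent-title');

const hasCyrillic = (s) => /[\u0400-\u04ff]/i.test(s);
console.log(hasCyrillic(parse('Better Call Saul S06 1080p WEB-DL').title)); // false — id can be searched
console.log(hasCyrillic(parse('Кухня S01 SATRip').title));                  // true — entry is skipped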
26
scraper/scrapers/scraperHelper.js
Normal file
@@ -0,0 +1,26 @@
function isPtDubbed(name) {
  return name.toLowerCase().match(/dublado|dual|nacional|multi/);
}

function sanitizePtName(name) {
  return name
      .replace(/(.*)\b(\d{3,4}P)\b(?!.*\d{3,4}[Pp])(.*)/, '$1$3 $2') // move the last resolution tag to the end
      .replace(/^[\[{]?(?:ACESSE.*|WWW\.)?[A-Z]+\.(COM|NET|ORG|TO|TV|ME)\b\s*[-\]}]+[\s.]*/i, '') // strip site watermarks
      .replace(/^(\d*(?:\.\d{1,2})?(?:[4A-Z-]{3,}|P)[-.]+)+/, '') // strip metadata prefixes
      .replace(/^[\[{]?(?:ACESSE.*|WWW\.)?[A-Z]+\.(COM|NET|ORG|TO|TV|ME)\b\s*[-\]}]+[\s.]*/i, '') // strip watermarks exposed by the previous step
      .replace(/^(COM|NET|ORG|TO|TV|ME)\b\s*-+[\s.]*/, '') // strip dangling site endings
      .trim();
}

function sanitizePtOriginalName(name) {
  return name.trim().replace(/S\d+$|\d.\s?[Tt]emporada/, '');
}

function sanitizePtLanguages(languages) {
  return languages
      .replace(/<2F><>/g, 'ê') // fix a mis-encoded 'ê' coming from the source pages
      .replace(/ /g, '')
      .trim();
}

module.exports = { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages }
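A few before/after examples of these helpers (hypothetical inputs; exact outputs depend on the regexes above):

const { isPtDubbed, sanitizePtName, sanitizePtOriginalName } = require('./scraperHelper');

console.log(isPtDubbed('Filme 2023 Dual Audio'));        // truthy — matches /dublado|dual|nacional|multi/
console.log(sanitizePtName('WWW.EXAMPLE.COM - Filme Dublado WEB-DL 1080P')); // leading watermark stripped
console.log(sanitizePtOriginalName('Breaking Bad 5a Temporada'));            // season suffix removed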
129
scraper/scrapers/thepiratebay/thepiratebay_api.js
Normal file
@@ -0,0 +1,129 @@
const needle = require('needle');
const { escapeHTML } = require('../../lib/metadata');

const baseUrl = 'https://apibay.org';
const timeout = 5000;

const Categories = {
  AUDIO: {
    ALL: 100,
    MUSIC: 101,
    AUDIO_BOOKS: 102,
    SOUND_CLIPS: 103,
    FLAC: 104,
    OTHER: 199
  },
  VIDEO: {
    ALL: 200,
    MOVIES: 201,
    MOVIES_DVDR: 202,
    MUSIC_VIDEOS: 203,
    MOVIE_CLIPS: 204,
    TV_SHOWS: 205,
    HANDHELD: 206,
    MOVIES_HD: 207,
    TV_SHOWS_HD: 208,
    MOVIES_3D: 209,
    OTHER: 299
  },
  APPS: {
    ALL: 300,
    WINDOWS: 301,
    MAC: 302,
    UNIX: 303,
    HANDHELD: 304,
    IOS: 305,
    ANDROID: 306,
    OTHER_OS: 399
  },
  GAMES: {
    ALL: 400,
    PC: 401,
    MAC: 402,
    PSx: 403,
    XBOX360: 404,
    Wii: 405,
    HANDHELD: 406,
    IOS: 407,
    ANDROID: 408,
    OTHER: 499
  },
  PORN: {
    ALL: 500,
    MOVIES: 501,
    MOVIES_DVDR: 502,
    PICTURES: 503,
    GAMES: 504,
    MOVIES_HD: 505,
    MOVIE_CLIPS: 506,
    OTHER: 599
  },
  OTHER: {
    ALL: 600,
    E_BOOKS: 601,
    COMICS: 602,
    PICTURES: 603,
    COVERS: 604,
    PHYSIBLES: 605,
    OTHER: 699
  }
};

function torrent(torrentId, retries = 2) {
  if (!torrentId) {
    return Promise.reject(new Error('No valid torrentId provided'));
  }

  return _request(`t.php?id=${torrentId}`)
      .then(result => toTorrent(result))
      .catch(error => retries ? torrent(torrentId, retries - 1) : Promise.reject(error));
}

function search(keyword, config = {}, retries = 2) {
  if (!keyword) {
    return Promise.reject(new Error('No valid keyword provided'));
  }
  const q = keyword;
  const cat = config.category || Categories.VIDEO.ALL;

  return _request(`q.php?q=${q}&cat=${cat}`)
      .then(results => results.map((result) => toTorrent(result)))
      .catch(error => retries ? search(keyword, config, retries - 1) : Promise.reject(error));
}

function browse(config = {}, retries = 2) {
  const category = config.category || 0;
  const page = config.page - 1 || 0;

  return _request(`q.php?q=category:${category}:${page}`)
      .then(results => results.map((result) => toTorrent(result)))
      .catch(error => retries ? browse(config, retries - 1) : Promise.reject(error));
}

async function _request(endpoint) {
  const url = `${baseUrl}/${endpoint}`;
  return needle('get', url, { open_timeout: timeout })
      .then(response => {
        if (typeof response.body === 'object') {
          return response.body;
        }
        return Promise.reject(`Unexpected response body`);
      });
}

function toTorrent(result) {
  return {
    torrentId: result.id,
    name: escapeHTML(result.name),
    infoHash: result.info_hash.toLowerCase(),
    size: parseInt(result.size),
    seeders: parseInt(result.seeders),
    leechers: parseInt(result.leechers),
    subcategory: parseInt(result.category),
    uploadDate: new Date(result.added * 1000),
    imdbId: result.imdb || undefined,
    filesCount: result.num_files && parseInt(result.num_files) || undefined
  };
}

module.exports = { torrent, search, browse, Categories };
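apibay.org answers with plain JSON, which keeps this wrapper thin. A usage sketch (illustrative only, not commit code):

const tpb = require('./thepiratebay_api');

// Page numbers are 1-based here and translated to apibay's 0-based paging.
tpb.browse({ category: tpb.Categories.VIDEO.MOVIES_HD, page: 1 })
    .then(torrents => torrents.filter(t => t.seeders > 0).slice(0, 5))
    .then(top => top.forEach(t => console.log(t.infoHash, t.name)));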
175
scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js
Normal file
@@ -0,0 +1,175 @@
const moment = require('moment');
const needle = require('needle');
const Bottleneck = require('bottleneck');
const { ungzip } = require('node-gzip');
const LineByLineReader = require('line-by-line');
const fs = require('fs');
const thepiratebay = require('./thepiratebay_api.js');
const bing = require('nodejs-bing');
const { Type } = require('../../lib/types');
const { escapeHTML } = require('../../lib/metadata');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');

const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';

const limiter = new Bottleneck({ maxConcurrent: 40 });

async function scrape() {
  const lastDump = { updatedAt: 2147000000 };
  // const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
  // const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
  const checkPoint = 0;

  if (lastDump) {
    console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
    await downloadDump(lastDump);

    let entriesProcessed = 0;
    const lr = new LineByLineReader(CSV_FILE_PATH);
    lr.on('line', (line) => {
      if (line.includes("#ADDED")) {
        return;
      }
      if (entriesProcessed % 1000 === 0) {
        console.log(`Processed ${entriesProcessed} entries`);
      }
      if (entriesProcessed <= checkPoint) {
        entriesProcessed++;
        return;
      }

      const row = line.match(/(?<=^|;)(".*"|[^;]+)(?=;|$)/g);
      if (row.length !== 4) {
        console.log(`Invalid row: ${line}`);
        return;
      }
      const torrent = {
        uploadDate: moment(row[0], 'YYYY-MMM-DD HH:mm:ss').toDate(),
        infoHash: Buffer.from(row[1], 'base64').toString('hex'),
        title: escapeHTML(row[2])
            .replace(/^"|"$/g, '')
            .replace(/&#?\w{2,6};/g, ' ')
            .replace(/\s+/g, ' ')
            .trim(),
        size: parseInt(row[3], 10)
      };

      if (!limiter.empty()) {
        lr.pause();
      }

      limiter.schedule(() => processTorrentRecord(torrent)
          .catch((error) => console.log(`failed ${torrent.title} due to: ${error}`)))
          .then(() => limiter.empty())
          .then((empty) => empty && lr.resume())
          .then(() => entriesProcessed++);
    });
    lr.on('error', (err) => {
      console.log(err);
    });
    lr.on('end', () => {
      console.log(`finished scraping tpb dump: ${JSON.stringify(lastDump)}!`);
    });
  }
}

const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_DVDR,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];

async function processTorrentRecord(record) {
  if (await getStoredTorrentEntry(record)) {
    return;
  }

  const torrentFound = await findTorrent(record);

  if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
    return createSkipTorrentEntry(record);
  }

  const torrent = {
    infoHash: torrentFound.infoHash,
    provider: NAME,
    torrentId: torrentFound.torrentId,
    title: torrentFound.name,
    size: torrentFound.size,
    type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,
    imdbId: seriesCategories.includes(torrentFound.subcategory) && torrentFound.imdbId || undefined,
    uploadDate: torrentFound.uploadDate || record.uploadDate,
    seeders: torrentFound.seeders,
  };

  return createTorrentEntry(torrent);
}

async function findTorrent(record) {
  return findTorrentInSource(record)
      .catch(() => findTorrentViaBing(record));
}

async function findTorrentInSource(record) {
  let page = 0;
  let torrentFound;
  while (!torrentFound && page < 5) {
    const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
    // search results expose the info hash directly, so match on that
    torrentFound = torrents.filter(torrent => torrent.infoHash === record.infoHash)[0];
    page = torrents.length === 0 ? 1000 : page + 1;
  }
  if (!torrentFound) {
    return Promise.reject(new Error(`Failed to find torrent ${record.title}`));
  }
  return Promise.resolve(torrentFound)
      .then((torrent) => thepiratebay.torrent(torrent.torrentId)
          .catch(() => thepiratebay.torrent(torrent.torrentId)));
}

async function findTorrentViaBing(record) {
  return bing.web(`${record.infoHash}`)
      .then((results) => results
          .find(result => result.description.includes('Direct download via magnet link') ||
              result.description.includes('Get this torrent')))
      .then((result) => {
        if (!result) {
          console.warn(`Failed to find torrent ${record.title}`);
          return Promise.resolve(undefined);
        }
        return result.link.match(/torrent\/(\w+)\//)[1];
      })
      .then((torrentId) => torrentId && thepiratebay.torrent(torrentId));
}

function downloadDump(dump) {
  try {
    if (fs.existsSync(CSV_FILE_PATH)) {
      console.log('dump file already exists...');
      return;
    }
  } catch (err) {
    console.error(err);
  }

  console.log('downloading dump file...');
  return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
      .then((response) => response.body)
      .then((body) => {
        console.log('unzipping dump file...');
        return ungzip(body);
      })
      .then((unzipped) => {
        console.log('writing dump file...');
        return fs.promises.writeFile(CSV_FILE_PATH, unzipped);
      });
}

module.exports = { scrape, NAME };
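The line-reader/limiter pairing above is a simple backpressure loop: the reader pauses whenever the limiter still has queued jobs and resumes once limiter.empty() reports an idle queue. The same idea in isolation (sketch only; handle() is a hypothetical consumer):

const Bottleneck = require('bottleneck');
const limiter = new Bottleneck({ maxConcurrent: 40 });

async function handle(line) { /* process one CSV row */ }

function onLine(reader, line) {
  if (!limiter.empty()) {
    reader.pause(); // stop producing while work is still queued
  }
  limiter.schedule(() => handle(line))
      .then(() => limiter.empty() && reader.resume()); // resume once drained
}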
43
scraper/scrapers/thepiratebay/thepiratebay_fakes_removal.js
Normal file
@@ -0,0 +1,43 @@
const moment = require('moment');
const { Sequelize } = require('sequelize');
const Bottleneck = require('bottleneck');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');

const NAME = 'ThePirateBay';
const EMPTY_HASH = '0000000000000000000000000000000000000000';

const Op = Sequelize.Op;
const limiter = new Bottleneck({ maxConcurrent: 10 });

async function scrape() {
  console.log(`Starting ${NAME} fake removal...`);
  const startCreatedAt = moment().subtract(14, 'day');
  const endCreatedAt = moment();
  const whereQuery = {
    provider: NAME,
    type: Type.MOVIE,
    createdAt: { [Op.between]: [startCreatedAt, endCreatedAt] }
  };
  return repository.getTorrentsBasedOnQuery(whereQuery)
      .then(torrents => {
        console.log(`Checking for ${NAME} fake entries in ${torrents.length} torrents`);
        return Promise.all(torrents.map(torrent => limiter.schedule(() => removeIfFake(torrent))));
      })
      .then(results => {
        const removed = results.filter(result => result);
        console.log(`Finished ${NAME} fake removal with ${removed.length} removals in ${results.length} torrents`);
      });
}

async function removeIfFake(torrent) {
  const tpbTorrentInfo = await thepiratebay.torrent(torrent.torrentId).catch(() => null);
  if (tpbTorrentInfo && tpbTorrentInfo.infoHash === EMPTY_HASH) {
    console.log(`Removing ${NAME} fake torrent [${torrent.torrentId}][${torrent.infoHash}] ${torrent.title}`);
    return repository.deleteTorrent(torrent).catch(() => null);
  }
  return Promise.resolve(null);
}

module.exports = { scrape, NAME };
87
scraper/scrapers/thepiratebay/thepiratebay_scraper.js
Normal file
@@ -0,0 +1,87 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'ThePirateBay';
const UNTIL_PAGE = 5;

const limiter = new Bottleneck({ maxConcurrent: 10 });

const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];

async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent) {
  // return limiter.schedule(() => thepiratebay.torrent(torrent.torrentId));
  return Promise.resolve([]);
}

async function scrapeLatestTorrents() {
  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}

async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scraping ${NAME} ${category} category page ${page}`);
  return thepiratebay.browse({ category, page })
      .catch(error => {
        console.warn(`Failed ${NAME} scraping for [${page}] ${category} due to: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}

async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }

  if (!record || !allowedCategories.includes(record.subcategory)) {
    return Promise.resolve('Invalid torrent record');
  }

  const torrent = {
    infoHash: record.infoHash,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.name.replace(/\t|\s+/g, ' '),
    type: seriesCategories.includes(record.subcategory) ? Type.SERIES : Type.MOVIE,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: seriesCategories.includes(record.subcategory) && record.imdbId || undefined,
    languages: record.languages && record.languages.trim() || undefined
  };

  return createTorrentEntry(torrent);
}

module.exports = { scrape, updateSeeders, NAME };
@@ -0,0 +1,112 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const LineByLineReader = require('line-by-line');
const decode = require('magnet-uri');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');

const NAME = 'ThePirateBay';
const CSV_FILE_PATH = '/tmp/tpb.csv';

const limiter = new Bottleneck({ maxConcurrent: 40 });

async function scrape() {
  // await processTorrentRecord({ torrentId: 26877339, category: 'Video' });
  console.log(`starting to scrape tpb dump...`);
  // const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
  const checkPoint = 4115000;

  let entriesProcessed = 0;
  const lr = new LineByLineReader(CSV_FILE_PATH);
  lr.on('line', (line) => {
    if (entriesProcessed % 1000 === 0) {
      console.log(`Processed ${entriesProcessed} entries`);
    }
    if (entriesProcessed <= checkPoint) {
      entriesProcessed++;
      return;
    }

    const row = line.match(/(?<=^|,)(".*"|[^,]*)(?=,|$)/g);
    if (row.length !== 10) {
      console.log(`Invalid row: ${line}`);
      return;
    }
    const torrent = {
      torrentId: row[0],
      title: row[1]
          .replace(/^"|"$/g, '')
          .replace(/&amp;/g, '&')
          .replace(/&\w{2,6};/g, ' ')
          .replace(/\s+/g, ' ')
          .trim(),
      size: parseInt(row[2], 10),
      category: row[4],
      subcategory: row[5],
      infoHash: row[7].toLowerCase() || decode(row[9]).infoHash,
      magnetLink: row[9],
      uploadDate: moment(row[8]).toDate(),
    };

    if (!limiter.empty()) {
      lr.pause();
    }

    limiter.schedule(() => processTorrentRecord(torrent)
        .catch((error) => console.log(`failed ${torrent.title} due to: ${error}`)))
        .then(() => limiter.empty())
        .then((empty) => empty && lr.resume())
        .then(() => entriesProcessed++);
  });
  lr.on('error', (err) => {
    console.log(err);
  });
  lr.on('end', () => {
    console.log(`finished scraping tpb dump!`);
  });
}

const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_DVDR,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];

async function processTorrentRecord(record) {
  if (record.category !== 'Video') {
    return createSkipTorrentEntry(record);
  }
  if (await getStoredTorrentEntry(record)) {
    return;
  }

  const torrentFound = await thepiratebay.torrent(record.torrentId);

  if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
    return createSkipTorrentEntry(record);
  }

  const torrent = {
    infoHash: torrentFound.infoHash,
    provider: NAME,
    torrentId: torrentFound.torrentId,
    title: torrentFound.name,
    size: torrentFound.size,
    type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,
    imdbId: torrentFound.imdbId,
    uploadDate: torrentFound.uploadDate,
    seeders: torrentFound.seeders,
  };

  return createTorrentEntry(torrent);
}

module.exports = { scrape, NAME };
185
scraper/scrapers/torrentgalaxy/torrentgalaxy_api.js
Normal file
@@ -0,0 +1,185 @@
|
||||
const cheerio = require('cheerio');
|
||||
const needle = require('needle');
|
||||
const moment = require('moment');
|
||||
const decode = require('magnet-uri');
|
||||
const Promises = require('../../lib/promises');
|
||||
const { getRandomUserAgent } = require('../../lib/requestHelper');
|
||||
|
||||
const defaultProxies = [
|
||||
// 'https://torrentgalaxy.to',
|
||||
// 'https://torrentgalaxy.mx',
|
||||
'https://torrentgalaxy.su'
|
||||
];
|
||||
const defaultTimeout = 10000;
|
||||
|
||||
const Categories = {
|
||||
ANIME: '28',
|
||||
MOVIE_4K: '3',
|
||||
MOVIE_PACKS: '4',
|
||||
MOVIE_SD: '1',
|
||||
MOVIE_HD: '42',
|
||||
MOVIE_CAM: '45',
|
||||
MOVIE_BOLLYWOOD: '46',
|
||||
TV_SD: '5',
|
||||
TV_HD: '41',
|
||||
TV_PACKS: '6',
|
||||
TV_SPORT: '7',
|
||||
DOCUMENTARIES: '9'
|
||||
};
|
||||
|
||||
function torrent(torrentId, config = {}, retries = 2) {
|
||||
if (!torrentId || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${torrentId} search`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`)))
|
||||
.then((body) => parseTorrentPage(body))
|
||||
.then((torrent) => ({ torrentId, ...torrent }))
|
||||
.catch((err) => torrent(torrentId, config, retries - 1));
|
||||
}
|
||||
|
||||
function search(keyword, config = {}, retries = 2) {
|
||||
if (!keyword || retries === 0) {
|
||||
return Promise.reject(new Error(`Failed ${keyword} search`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}&search=${keyword}`)))
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch(() => search(keyword, config, retries - 1));
|
||||
}
|
||||
|
||||
function browse(config = {}, retries = 2, error = null) {
|
||||
if (retries === 0) {
|
||||
return Promise.reject(error || new Error(`Failed browse request`));
|
||||
}
|
||||
const proxyList = config.proxyList || defaultProxies;
|
||||
const page = config.page || 1;
|
||||
const category = config.category;
|
||||
|
||||
return Promises.first(proxyList
|
||||
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}`)))
|
||||
.then((body) => parseTableBody(body))
|
||||
.catch((err) => browse(config, retries - 1, err));
|
||||
}
|
||||
|
||||
function singleRequest(requestUrl) {
|
||||
const options = { userAgent: getRandomUserAgent(), open_timeout: defaultTimeout, follow: 2 };
|
||||
|
||||
return needle('get', requestUrl, options)
|
||||
.then((response) => {
|
||||
const body = response.body;
|
||||
if (!body) {
|
||||
throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
|
||||
} else if (body.includes('Access Denied')) {
|
||||
console.log(`Access Denied: ${requestUrl}`);
|
||||
throw new Error(`Access Denied: ${requestUrl}`);
|
||||
} else if (body.includes('502: Bad gateway') ||
|
||||
body.includes('403 Forbidden') ||
|
||||
body.includes('Origin DNS error')) {
|
||||
throw new Error(`Invalid body contents: ${requestUrl}`);
|
||||
}
|
||||
return body;
|
||||
});
|
||||
}
|
||||
|
||||
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }

    const torrents = [];

    $('.tgxtable > div').each((i, element) => {
      if (i === 0) return; // skip the table header row
      const row = $(element);
      const magnetLink = row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(2)').attr('href');
      const imdbIdMatch = row.html().match(/search=(tt\d+)/i);
      try {
        torrents.push({
          name: row.find('.tgxtablecell div a[title]').first().text(),
          infoHash: decode(magnetLink).infoHash,
          magnetLink: magnetLink,
          torrentLink: row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(1)').first().attr('href'),
          torrentId: row.find('.tgxtablecell div a[title]').first().attr('href').match(/torrent\/(\d+)/)[1],
          verified: !!row.find('i.fa-check').length,
          category: row.find('div:nth-of-type(n+2) .shrink a').first().attr('href').match(/cat=(\d+)$/)[1],
          seeders: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'green\'] b').first().text(), 10),
          leechers: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'#ff0000\'] b').first().text(), 10),
          languages: row.find('.tgxtablecell img[title]').first().attr('title'),
          size: parseSize(row.find('.collapsehide span.badge-secondary').first().text()),
          uploadDate: parseDate(row.find('div.collapsehide:nth-of-type(12)').first().text()),
          imdbId: imdbIdMatch && imdbIdMatch[1],
        });
      } catch (e) {
        console.error('Failed parsing TorrentGalaxy row: ', e);
      }
    });

    resolve(torrents);
  });
}

function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);

    if (!$) {
      return reject(new Error('Failed loading body'));
    }
    const content = $('div[class="torrentpagetable limitwidth"]').first();
    const magnetLink = $('a[class="btn btn-danger"]').attr('href');
    const imdbIdContent = $('a[title="IMDB link"]').attr('href');
    const imdbIdMatch = imdbIdContent && imdbIdContent.match(/imdb\.com\/title\/(tt\d+)/i);

    const torrent = {
      name: content.find('.linebreakup a').first().text(),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      verified: !content.find('i.fa-exclamation-triangle').length,
      torrentLink: $('a[class="btn btn-success"]').attr('href'),
      seeders: parseInt(content.find('font[color=\'green\']').first().text(), 10),
      category: content.find('div:nth-of-type(4) a:nth-of-type(2)').first().attr('href').match(/cat=(\d+)$/)[1],
      languages: content.find('div:nth-of-type(5) div:nth-of-type(2)').first().text().trim(),
      size: parseSize(content.find('div:nth-of-type(6) div:nth-of-type(2)').first().text()),
      uploadDate: parseDate(content.find('div:nth-of-type(9) div:nth-of-type(2)').first().text()),
      imdbId: imdbIdMatch && imdbIdMatch[1],
    };
    resolve(torrent);
  });
}

function parseSize(sizeText) {
  if (!sizeText) {
    return undefined;
  }
  let scale = 1;
  if (sizeText.includes('GB')) {
    scale = 1024 * 1024 * 1024;
  } else if (sizeText.includes('MB')) {
    scale = 1024 * 1024;
  } else if (sizeText.includes('KB') || sizeText.includes('kB')) {
    scale = 1024;
  }
  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
}

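A couple of worked examples of the size parser (inputs illustrative of the site's size badges):

// parseSize('1.5 GB')   -> 1610612736   (1.5 * 1024^3)
// parseSize("1'234 MB") -> 1293942784   (separator stripped, 1234 * 1024^2)
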
function parseDate(dateString) {
  if (dateString.includes('ago')) {
    const amount = parseInt(dateString, 10);
    const unit = dateString.includes('Min') ? 'minutes' : 'hours';
    return moment().subtract(amount, unit).toDate();
  }
  const preparedDate = dateString.replace(/\//g, '-').replace(/-(\d{2})\s/, '-20$1 ');
  return moment(preparedDate, 'DD-MM-YYYY HH:mm').toDate();
}

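Worked examples of the date parser (inputs illustrative): relative stamps are offset from now, absolute ones get their two-digit year expanded before parsing:

// parseDate('13 Min ago')     -> now minus 13 minutes
// parseDate('07/05/23 14:01') -> '07-05-2023 14:01' parsed as DD-MM-YYYY HH:mm
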
module.exports = { torrent, search, browse, Categories };
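A quick usage sketch of this module (the id comes from the debug snippet further down and is otherwise illustrative):

const tgx = require('./torrentgalaxy_api');
tgx.search('dune', { category: tgx.Categories.MOVIE_HD })
    .then(torrents => console.log(torrents.map(t => t.name)));
tgx.torrent('14212584').then(t => console.log(t.infoHash, t.size));
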

132
scraper/scrapers/torrentgalaxy/torrentgalaxy_scraper.js
Normal file
@@ -0,0 +1,132 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const torrentGalaxy = require('./torrentgalaxy_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'TorrentGalaxy';
const TYPE_MAPPING = typeMapping();

const apiLimiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
const limiter = new Bottleneck({ maxConcurrent: 10 });
const allowedCategories = [
  torrentGalaxy.Categories.ANIME,
  torrentGalaxy.Categories.MOVIE_4K,
  torrentGalaxy.Categories.MOVIE_PACKS,
  torrentGalaxy.Categories.MOVIE_SD,
  torrentGalaxy.Categories.MOVIE_HD,
  torrentGalaxy.Categories.MOVIE_CAM,
  torrentGalaxy.Categories.MOVIE_BOLLYWOOD,
  torrentGalaxy.Categories.TV_SD,
  torrentGalaxy.Categories.TV_HD,
  torrentGalaxy.Categories.TV_PACKS,
  torrentGalaxy.Categories.DOCUMENTARIES,
];
const packCategories = [
  torrentGalaxy.Categories.MOVIE_PACKS,
  torrentGalaxy.Categories.TV_PACKS
];

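Two Bottleneck limiters split the work: `apiLimiter` throttles requests against the site itself (one at a time, at most one every 5 seconds), while `limiter` only caps local record processing. A minimal sketch of the effect (names here are illustrative):

// With maxConcurrent: 1 and minTime: 5000, scheduled jobs start
// at least 5 seconds apart, however fast they are queued.
const polite = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
['1', '42'].forEach(cat =>
    polite.schedule(() => torrentGalaxy.browse({ category: cat }))
        .then(rows => console.log(`category ${cat}: ${rows.length} torrents`)));
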
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  // const ids = ['14212584'];
  // return Promise.all(ids.map(id => limiter.schedule(() => torrentGalaxy.torrent(id)
  //     .then(torrent => processTorrentRecord(torrent)))))
  //     .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent) {
  return limiter.schedule(() => torrentGalaxy.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
  return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}

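`Promises.sequence` (also from `lib/promises`, not shown in this diff) presumably runs the category thunks one at a time, so only a single category is being paginated at any moment; an assumed equivalent:

// Assumed behaviour: execute promise-returning thunks serially, collect results.
async function sequence(thunks) {
  const results = [];
  for (const thunk of thunks) {
    results.push(await thunk());
  }
  return results;
}
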
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scraping ${NAME} ${category} category page ${page}`);
  return apiLimiter.schedule(() => torrentGalaxy.browse({ category, page }))
      .catch(error => {
        console.warn(`Failed ${NAME} scraping for [${page}] ${category} due to: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}

async function processTorrentRecord(record) {
  if (!record || !TYPE_MAPPING[record.category] || !record.verified) {
    return Promise.resolve('Invalid torrent record');
  }

  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    torrentId: record.torrentId,
    torrentLink: record.torrentLink,
    title: record.name.replace(/\t|\s+/g, ' '),
    type: TYPE_MAPPING[record.category],
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId,
    pack: packCategories.includes(record.category),
    languages: !(record.languages || '').includes('Other') ? record.languages : undefined
  };

  if (await checkAndUpdateTorrent(torrent)) {
    return torrent;
  }
  const isOld = moment(torrent.uploadDate).isBefore(moment().subtract(18, 'month'));
  if (torrent.seeders === 0 && isOld && !torrent.pack) {
    console.log(`Skipping old unseeded torrent [${torrent.infoHash}] ${torrent.title}`);
    return torrent;
  }

  return createTorrentEntry(torrent).then(() => torrent);
}

function typeMapping() {
  const mapping = {};
  mapping[torrentGalaxy.Categories.MOVIE_SD] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.MOVIE_HD] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.MOVIE_4K] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.MOVIE_CAM] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.MOVIE_PACKS] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.MOVIE_BOLLYWOOD] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.DOCUMENTARIES] = Type.MOVIE;
  mapping[torrentGalaxy.Categories.TV_SD] = Type.SERIES;
  mapping[torrentGalaxy.Categories.TV_HD] = Type.SERIES;
  mapping[torrentGalaxy.Categories.TV_PACKS] = Type.SERIES;
  mapping[torrentGalaxy.Categories.TV_SPORT] = Type.SERIES;
  mapping[torrentGalaxy.Categories.ANIME] = Type.ANIME;
  return mapping;
}

function getMaxPage(category) {
  switch (category) {
    case torrentGalaxy.Categories.TV_SD:
    case torrentGalaxy.Categories.TV_HD:
    case torrentGalaxy.Categories.MOVIE_SD:
    case torrentGalaxy.Categories.MOVIE_HD:
      return 5;
    default:
      return 1;
  }
}

module.exports = { scrape, updateSeeders, NAME };
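Both scrapers in this commit export the same { scrape, updateSeeders, NAME } surface, so a scheduler can drive them uniformly; a hypothetical driver loop (the real entrypoint is not part of this hunk):

const providers = [
  require('./scrapers/torrentgalaxy/torrentgalaxy_scraper'),
  require('./scrapers/yts/yts_scraper')
];
providers.forEach(provider => provider.scrape()
    .catch(error => console.error(`${provider.NAME} scrape failed: `, error)));
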

92
scraper/scrapers/yts/yts_api.js
Normal file
@@ -0,0 +1,92 @@
const needle = require('needle');
const Promises = require('../../lib/promises');
const { getRandomUserAgent } = require('./../../lib/requestHelper');

const defaultProxies = [
  'https://yts.mx'
];
const defaultTimeout = 30000;
const limit = 50;

function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }

  return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/v2/movie_details.json?movie_id=${torrentId}`, config)))
      .then(body => parseResults(body))
      .catch(() => torrent(torrentId, config, retries - 1));
}

function search(query, config = {}, retries = 2) {
  if (!query || retries === 0) {
    return Promise.reject(new Error(`Failed ${query} search`));
  }

  return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/v2/list_movies.json?limit=${limit}&query_term=${encodeURIComponent(query)}`, config)))
      .then(results => parseResults(results))
      .catch(() => search(query, config, retries - 1));
}

function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;

  return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/v2/list_movies.json?limit=${limit}&page=${page}`, config)))
      .then(results => parseResults(results))
      .catch(() => browse(config, retries - 1));
}

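With the defaults above, a browse call from a consumer resolves against the public YTS v2 list endpoint, for example:

// GET https://yts.mx/api/v2/list_movies.json?limit=50&page=2
const yts = require('./yts_api');
yts.browse({ page: 2 }).then(records => console.log(`${records.length} torrent records`));
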
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };

  return needle('get', requestUrl, options)
      .then(response => {
        if (!response.body) {
          return Promise.reject(new Error(`No body: ${requestUrl}`));
        }
        return Promise.resolve(response.body);
      });
}

function parseResults(results) {
  if (!results || !results.data || (!results.data.movie && !Array.isArray(results.data.movies))) {
    console.log('Incorrect results: ', results);
    return Promise.reject(new Error('Incorrect results'));
  }
  return (results.data.movies || [results.data.movie])
      .filter(movie => Array.isArray(movie.torrents))
      .map(movie => parseMovie(movie))
      .reduce((a, b) => a.concat(b), []);
}

function parseMovie(movie) {
  return movie.torrents.map(torrent => ({
    name: `${movie.title} ${movie.year} ${torrent.quality} ${formatType(torrent.type)}`,
    torrentId: `${movie.id}-${torrent.hash.trim().toLowerCase()}`,
    infoHash: torrent.hash.trim().toLowerCase(),
    torrentLink: torrent.url,
    seeders: torrent.seeds,
    size: torrent.size_bytes,
    uploadDate: new Date(torrent.date_uploaded_unix * 1000),
    imdbId: movie.imdb_code
  }));
}

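Each YTS movie carries several encodes, so one movie fans out into one record per torrent; for example (fields abbreviated, values illustrative):

// { id: 10, title: 'Movie', year: 2020, imdb_code: 'tt0000000',
//   torrents: [{ quality: '1080p', type: 'web', ... }, { quality: '720p', type: 'bluray', ... }] }
// -> two records: 'Movie 2020 1080p WEBRip' and 'Movie 2020 720p BluRay'
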
function formatType(type) {
  if (type === 'web') {
    return 'WEBRip';
  }
  if (type === 'bluray') {
    return 'BluRay';
  }
  return type.toUpperCase();
}

module.exports = { torrent, search, browse };

71
scraper/scrapers/yts/yts_scraper.js
Normal file
@@ -0,0 +1,71 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const yts = require('./yts_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');

const NAME = 'YTS';
const UNTIL_PAGE = 2;

const limiter = new Bottleneck({ maxConcurrent: 10 });

async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);

  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}

async function updateSeeders(torrent) {
  return limiter.schedule(() => yts.torrent(torrent.torrentId));
}

async function scrapeLatestTorrents() {
  return scrapeLatestTorrentsForCategory();
}

async function scrapeLatestTorrentsForCategory(page = 1) {
  console.log(`Scraping ${NAME} page ${page}`);
  return yts.browse({ page })
      .catch(error => {
        console.warn(`Failed ${NAME} scraping for [${page}] due to: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(page + 1)
          : Promise.resolve());
}

async function processTorrentRecord(record) {
  if (!record) {
    return Promise.resolve('Invalid torrent record');
  }
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }

  if (!record.size) {
    return Promise.resolve('Invalid torrent record');
  }

  const torrent = {
    infoHash: record.infoHash,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.name.replace(/\t|\s+/g, ' ').trim(),
    type: Type.MOVIE,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId,
  };

  return createTorrentEntry(torrent).then(() => torrent);
}

module.exports = { scrape, updateSeeders, NAME };