Added back original scrapers, integrated with PGSQL

This commit is contained in:
Gabisonfire
2024-01-17 16:20:00 -05:00
parent 364d1545b1
commit cab7f38c66
79 changed files with 65832 additions and 1702 deletions

View File

@@ -1,4 +0,0 @@
*.lock
*/scrape-cache
Dockerfile
docker-compose.yml

View File

@@ -1,12 +0,0 @@
FROM python:3.10.13
COPY . .
# Install pipenv plus the sqlite3 CLI, then pre-create and seed the scraper DB.
# "VACUUM;" against a fresh path materialises an empty database file before the
# schema dump is loaded into it.
# Fix: apt-get install needs --yes or a non-interactive build can abort on the
# confirmation prompt.
RUN pip --no-cache-dir install pipenv && \
    apt-get update && apt-get install --yes sqlite3 && \
    apt-get clean autoclean && \
    apt-get autoremove --yes && \
    mkdir /sqlite && \
    sqlite3 /sqlite/torrentio.sqlite "VACUUM;" && \
    cat torrentio.sql | sqlite3 /sqlite/torrentio.sqlite && \
    ls -l /sqlite && \
    pipenv install && touch .init
# .init marks that the one-time initial library sync should run on first start.
CMD [ "pipenv", "run", "python", "-u", "scraper.py" ]

View File

@@ -1,16 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
requests = "*"
beautifulsoup4 = "*"
imdbpy = "*"
apscheduler = "*"
[dev-packages]
[requires]
python_version = "3.10"
python_full_version = "3.10.13"

View File

@@ -1,15 +0,0 @@
{
"1337x": {
"sleep": 10,
"urls_to_scrape": [
"https://1337x.to/trending/w/movies/",
"https://1337x.to/trending/d/movies/"
],
"scrape_interval": {
"days": 0,
"hours" : 1,
"minutes": 0,
"seconds": 0
}
}
}

View File

@@ -2,30 +2,43 @@ version: '2'
services:
mongodb:
restart: always
image: docker.io/bitnami/mongodb:7.0
ports:
- "27017:27017"
volumes:
- 'mongodb_data:/bitnami/mongodb'
scraper:
build: ./
build: ../../scraper/
restart: always
volumes:
- 'sqlite_data:/sqlite'
ports:
- "7001:7001"
environment:
- PORT=7001
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
torrentio:
build: ../
volumes:
- 'sqlite_data:/sqlite'
restart: always
ports:
- "7000:7000"
links:
- "mongodb:mongodb"
environment:
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=sqlite:/sqlite/torrentio.sqlite
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
postgres:
image: postgres:14-alpine
ports:
- 5432:5432
volumes:
- pg_data:/var/lib/postgresql/data
- './init:/docker-entrypoint-initdb.d'
environment:
- POSTGRES_HOST_AUTH_METHOD=trust
- POSTGRES_USER=postgres
- POSTGRES_DB=torrentio
volumes:
mongodb_data:
driver: local
sqlite_data:
pg_data:
driver: local

View File

@@ -0,0 +1,53 @@
-- Core torrent metadata, one row per torrent keyed by its infohash.
CREATE TABLE "torrents" (
    "infoHash" varchar(64) PRIMARY KEY,
    "provider" varchar(32) NOT NULL,
    "torrentId" varchar(128),
    "title" varchar(256) NOT NULL,
    "size" bigint,
    "type" varchar(16) NOT NULL,
    "uploadDate" date NOT NULL,
    "seeders" integer,
    "trackers" varchar(4096),   -- comma-joined tracker URLs
    "languages" varchar(4096),
    "resolution" varchar(16),
    "createdAt" date,
    "updatedAt" date,
    "reviewed" boolean,
    "opened" boolean
);
-- Individual (video) files contained in a torrent, mapped to imdb/kitsu ids.
CREATE TABLE "files" (
    "id" SERIAL PRIMARY KEY,
    "infoHash" varchar(64) NOT NULL,
    "fileIndex" integer,
    "title" varchar(256) NOT NULL,
    "size" bigint,
    "imdbId" varchar(32),
    "imdbSeason" integer,
    "imdbEpisode" integer,
    "kitsuId" integer,
    "kitsuEpisode" integer,
    "createdAt" date,
    "updatedAt" date,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
    UNIQUE("infoHash","fileIndex")
);
-- Subtitle files, optionally linked to a specific video file row.
CREATE TABLE "subtitles" (
    "id" SERIAL PRIMARY KEY,
    "infoHash" varchar(64) NOT NULL,
    "fileIndex" integer NOT NULL,
    "fileId" bigint,
    "title" varchar(512) NOT NULL,
    "size" bigint,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
    FOREIGN KEY("fileId") REFERENCES files("id") ON DELETE SET NULL
);
-- Raw content paths of a torrent (no primary key; rows follow the torrent).
CREATE TABLE "contents" (
    "infoHash" varchar(64),
    "fileIndex" integer,
    "path" varchar(256),
    "size" bigint,
    FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE
);

View File

@@ -1,141 +0,0 @@
import os
import re
import math
import requests
import time
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from shared import imdb_find, build_and_write, extract_title, read_config
TORRENT_CACHES = ('http://itorrents.org', 'http://torrage.info', 'http://btcache.me')
BASE_URL = 'https://1337x.to'
MOVIE_BASE = 'https://1337x.to/movie-library'
MOVIE_LIBRARY_MAX_PAGE = 301
CACHE_DIR = "./scrape-cache"
PROVIDER = "1337x"
SLEEP_BETWEEN_REQUESTS = read_config(PROVIDER, "sleep")
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
def get_links_and_process(url):
    """Scrape one listing page and process every torrent link found on it."""
    print(f"Requesting movies from: {url}")
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, "html.parser")
    torrent_links = [
        (anchor.get("href"), extract_title(anchor.text))
        for anchor in soup.find_all("a")
        if anchor.get("href").startswith("/torrent/")
    ]
    process_links(torrent_links)
def get_links_initial():
    """Crawl the full 1337x movie library (pages 1..MOVIE_LIBRARY_MAX_PAGE).

    Index and movie pages are cached on disk under CACHE_DIR so an
    interrupted initial sync can resume without re-downloading everything.

    Returns:
        list[tuple[str, str]]: (torrent page href, movie title) pairs.
    """
    links = []
    for i in range(1, MOVIE_LIBRARY_MAX_PAGE + 1):
        try:
            # Rate-limit every page iteration (cached or not), as before.
            print(f"Sleeping {SLEEP_BETWEEN_REQUESTS}")
            time.sleep(SLEEP_BETWEEN_REQUESTS)
            main = ""
            index_cache = f"{CACHE_DIR}/main-{i}.html"
            if os.path.isfile(index_cache):
                print(f"Reading main page({i}) from cache...")
                # Context managers close the handles (they were leaked before).
                with open(index_cache, "r") as f:
                    main = f.read()
            else:
                print(f"Requesting main index: {MOVIE_BASE}/{i}/")
                req = requests.get(f"{MOVIE_BASE}/{i}/", headers={'User-Agent': 'Mozilla/5.0'})
                if req.status_code == 404:
                    print(f"Page does not exist: {MOVIE_BASE}/{i}/. Breaking loop.")
                    break
                main = req.text
                with open(index_cache, "w+") as f:
                    f.write(main)
            movies = []
            soup = BeautifulSoup(main, "html.parser")
            for h3 in soup.find_all("h3"):
                a = h3.findChildren("a", href=True)[0]
                movies.append((a.text, a.get("href")))
            for movie_title, movie_link in movies:
                movie_cache = f"{CACHE_DIR}{movie_link}html.html"
                if os.path.isfile(movie_cache):
                    print(f"Reading movie page({movie_title}) from cache...")
                    with open(movie_cache) as f:
                        main = f.read()
                else:
                    print(f"Requesting movie releases: {BASE_URL}{movie_link}")
                    req = requests.get(f"{BASE_URL}{movie_link}", headers={'User-Agent': 'Mozilla/5.0'})
                    main = req.text
                    if not os.path.exists(f"{CACHE_DIR}{movie_link}"):
                        os.makedirs(f"{CACHE_DIR}{movie_link}")
                    with open(movie_cache, "w+") as f:
                        f.write(main)
                soup = BeautifulSoup(main, "html.parser")
                for href in soup.find_all("a"):
                    if href.get("href").startswith("/torrent/"):
                        links.append((href.get("href"), movie_title))
        except Exception as e:
            print(e)
    return links
def process_links(links):
    """Fetch each torrent detail page, parse metadata and persist it.

    Args:
        links: list of (torrent page href, fallback title for IMDB search).
    """
    print(f"Checking links...({len(links)})")
    counter = 1
    for link in links:
        try:
            print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
            req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
            torrent_html = req.text
            t = {}
            soup = BeautifulSoup(torrent_html, "html.parser")
            t['title'] = soup.find("h1").text.strip()
            t['size'] = 0
            t['magnets'] = []
            t['torrents'] = []
            for a in soup.find_all("a"):
                href = a.get("href")
                if href.startswith("https://www.imdb.com/title"):
                    t['imdbid'] = href.rstrip("\\").split('/')[-1]
                if href.startswith("magnet:"):
                    t['magnets'].append(href)
                if href.startswith(TORRENT_CACHES):
                    t['torrents'].append(href)
            for li in soup.find_all("li"):
                if "Total size" in li.text:
                    size = li.findChildren("span")[0].text
                    mb = "MB" in size
                    # Raw strings avoid invalid escape-sequence warnings.
                    size = re.sub(r'\s(GB|MB)', '', size).split('.')[0].replace(',', '')
                    if mb:
                        # 1 MiB = 1048576 bytes (previous constant 107374182
                        # was ~102x too large for MB-sized torrents).
                        t['size'] = math.trunc(float(size) * 1048576)
                    else:
                        # 1 GiB = 1073741824 bytes.
                        t['size'] = math.trunc(float(size) * 1073741824)
            t['seeders'] = soup.find("span", {"class": "seeds"}).text
            for p in soup.find_all("p"):
                if "Infohash :" in p.text:
                    t['infoHash'] = p.findChildren("span")[0].text.lower()
            t['files'] = []
            file_div = soup.find("div", {"id": "files"})
            for li in file_div.findChildren("li"):
                # Drop the trailing "(size)" annotation from each file name.
                t['files'].append(re.sub(r'\s\(.*\)', '', li.text))
            t['trackers'] = []
            tracker_div = soup.find("div", {"id": "tracker-list"})
            for tracker in tracker_div.findChildren("li"):
                t['trackers'].append(tracker.text.strip())
            if 'imdbid' not in t or t['imdbid'] == '':
                # Fall back to a raw regex over the page, then an IMDB search.
                found = re.search(r"https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
                if found is not None:
                    t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
                else:
                    new_id = imdb_find(link[1])
                    if new_id is not None:
                        t['imdbid'] = f"tt{new_id}"
                    else:
                        print(f"{t['title']} has no IMDB Id")
                        continue
            build_and_write(t)
        except Exception as e:
            # Log the failure instead of silently swallowing it (was a bare except).
            print(f"Failed to process {link[0]}: {e}")
        finally:
            # finally: the counter now also advances on the 'continue' path above.
            counter += 1

View File

@@ -1,41 +0,0 @@
import os
import threading
from p1337x import process_links, get_links_initial, get_links_and_process
from apscheduler.schedulers.background import BackgroundScheduler
from shared import read_config

# Entry point: optionally runs a one-time full library sync, then schedules
# recurring scrapes of the configured listing pages.
print("Scraper starting...")
if os.path.isfile(".init"):
    print("Found init file, executing initial sync. Be patient.")
    process_links(get_links_initial())
    # Remove the marker so the expensive initial sync only runs once.
    os.remove(".init")
sched = BackgroundScheduler(timezone="America/New_York")
sched.start()
# 1337x
PROVIDER = "1337x"
pages = read_config(PROVIDER, "urls_to_scrape")
interval = read_config(PROVIDER, "scrape_interval")
for page in pages:
    # One interval job per listing URL; max_instances=1 prevents overlap when
    # a scrape runs longer than the interval.
    j = sched.add_job(
        get_links_and_process,
        'interval',
        days=interval["days"],
        hours=interval["hours"],
        minutes=interval["minutes"],
        seconds=interval["seconds"],
        id=page,
        args=[page],
        max_instances=1)
    print(f"{page} will be scraped {j.next_run_time}.")
# Wait forever so the background scheduler keeps running.
main_thread = threading.main_thread()
while True:
    L = threading.enumerate()
    L.remove(main_thread)  # or avoid it in the for loop
    for t in L:
        t.join()

View File

@@ -1,73 +0,0 @@
import sqlite3
import re
import os
import json
from imdb import Cinemagoer
SQLITE_PATH = "/sqlite/torrentio.sqlite"
ia = Cinemagoer()
CONFIG = "config.json"
def read_config(provider, key):
    """Read a single provider setting from the JSON config file.

    Args:
        provider: top-level provider name in the config (e.g. "1337x").
        key: setting name under that provider.

    Returns:
        The configured value, or None when the config file does not exist.
    """
    if os.path.isfile(CONFIG):
        # Context manager closes the handle (it was leaked before).
        with open(CONFIG, "r") as f:
            cfg = json.load(f)
        return cfg[provider][key]
    return None
def filter_file(file):
    """Return True when the filename carries a recognised video extension."""
    allowed_ext = ['.mp4', '.mkv', '.avi', '.mpeg', '.mpg', '.mpv', '.mov']
    _, ext = os.path.splitext(file)
    return ext in allowed_ext
def create_connection(db_file):
    """Open a SQLite connection shared across threads; exit the process on failure."""
    try:
        # check_same_thread=False: the connection is used from scheduler threads.
        return sqlite3.connect(db_file, check_same_thread=False)
    except Exception as e:
        print(e)
        exit(1)
sqlite = create_connection(SQLITE_PATH)
def build_and_write(torrent):
    """Upsert a torrent row and its playable files into the SQLite database.

    Args:
        torrent: dict with keys infoHash, title, size, seeders, trackers,
            files and imdbid (see process_links).
    """
    try:
        print(f"Recording {torrent['title']} in the database")
        q = "INSERT OR REPLACE INTO torrents (infoHash, provider, title, size, type, uploadDate, seeders, trackers) VALUES (?,?,?,?,?,?,?,?)"
        p = (torrent['infoHash'], '1337x', torrent['title'], torrent['size'], 'movie', '1/1/2024', torrent['seeders'], ','.join(torrent['trackers']))
        cursor = sqlite.cursor()
        cursor.execute(q, p)
        # enumerate() gives a stable per-file index; the previous
        # list.index(file) lookup was O(n^2) and returned the wrong index
        # for duplicate file names.
        for index, file in enumerate(torrent['files']):
            if filter_file(file):
                q = "INSERT OR REPLACE INTO files (infoHash, fileIndex, title, size, imdbId) VALUES (?,?,?,?,?)"
                p = (torrent['infoHash'], index, file, torrent['size'], torrent['imdbid'])
                cursor.execute(q, p)
        sqlite.commit()
        cursor.close()
    except sqlite3.Error as error:
        print(error)
def imdb_find(name):
    """Search IMDB for a title; return the first match's movieID, or None."""
    results = ia.search_movie(name)
    if not results:
        return None
    return results[0].movieID
def extract_title(filename):
    """Heuristically strip release tags from a torrent/file name.

    Drops the video extension, everything after quality/year markers
    (dvdrip, xvid, brrip, a 4-digit year, ...) and parenthesised chunks.

    Returns:
        The cleaned title (may keep a trailing space), or "" on any error.
    """
    try:
        # Bug fix: str.strip() returns a new string; the result was discarded.
        filename = filename.strip()
        filename = filename.replace('.', ' ')
        # Raw strings keep the regexes as written without invalid escapes.
        res = re.search(r'([^\\]+)\.(avi|mkv|mpeg|mpg|mov|mp4)$', filename)
        if res:
            filename = res.group(1)
        res = re.search(r'(.*?)(dvdrip|xvid| cd[0-9]|dvdscr|brrip|divx|[\{\(\[]?[0-9]{4}).*', filename)
        if res:
            filename = res.group(1)
        res = re.search(r'(.*?)\(.*\)(.*)', filename)
        if res:
            filename = res.group(1)
        return filename
    except Exception:
        return ""

View File

@@ -1,44 +0,0 @@
-- Legacy SQLite schema for the scraper database.
-- Torrent metadata keyed by infohash.
CREATE TABLE "torrents" (
    "infoHash" TEXT,
    "provider" TEXT NOT NULL,
    "torrentId" TEXT,
    "title" TEXT NOT NULL,
    "size" INTEGER,
    "type" TEXT NOT NULL,
    "uploadDate" TEXT NOT NULL,
    "seeders" INTEGER,
    "trackers" TEXT,
    "languages" TEXT,
    "resolution" TEXT,
    "createdAt" TEXT,
    "updatedAt" TEXT,
    PRIMARY KEY("infoHash")
);
-- Individual video files inside a torrent.
CREATE TABLE "files" (
    "id" INTEGER,
    "infoHash" TEXT NOT NULL,
    "fileIndex" TEXT,
    "title" INTEGER,
    "size" INTEGER,
    "imdbId" TEXT,
    "imdbSeason" INTEGER,
    "imdbEpisode" INTEGER,
    "kitsuId" INTEGER,
    "kitsuEpisode" INTEGER,
    "createdAt" TEXT,
    "updatedAt" TEXT,
    -- Fixed: foreign keys must reference the actual table name "torrents"
    -- (was "torrent"), and table constraints must be comma-separated.
    FOREIGN KEY("infoHash") REFERENCES "torrents"("infoHash") ON DELETE CASCADE,
    PRIMARY KEY("id" AUTOINCREMENT),
    UNIQUE("infoHash", "fileIndex")
);
-- Subtitle files, optionally linked to a specific video file row.
CREATE TABLE "subtitles" (
    "infoHash" TEXT NOT NULL,
    "fileIndex" INTEGER NOT NULL,
    "fileId" INTEGER,
    "title" TEXT NOT NULL,
    "size" INTEGER NOT NULL,
    -- Fixed: same corrections here ("files"/"torrents", separating comma).
    FOREIGN KEY("fileId") REFERENCES "files"("id") ON DELETE SET NULL,
    FOREIGN KEY("infoHash") REFERENCES "torrents"("infoHash") ON DELETE CASCADE
);

1384
addon/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -23,14 +23,14 @@
"named-queue": "^2.2.1",
"offcloud-api": "^1.0.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#022408972c2a040f846331a912a6a8487746a654",
"pg": "^8.10.0",
"pg": "^8.11.3",
"pg-hstore": "^2.3.4",
"premiumize-api": "^1.0.3",
"prom-client": "^12.0.0",
"real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#d1f7eaa8593b947edbfbc8a92a176448b48ef445",
"request-ip": "^3.3.0",
"router": "^1.3.8",
"sequelize": "^6.31.1",
"sqlite3": "^5.1.7",
"stremio-addon-sdk": "^1.6.10",
"swagger-stats": "^0.99.7",
"ua-parser-js": "^1.0.36",

12
scraper/Dockerfile Normal file
View File

@@ -0,0 +1,12 @@
FROM node:16-alpine
# git is needed because some npm dependencies are installed from git URLs.
RUN apk update && apk upgrade && \
    apk add --no-cache git
WORKDIR /home/node/app
# Copy the manifests first so the install layer is cached across code changes.
COPY package*.json ./
RUN npm ci --only-production
COPY . .
CMD [ "node", "--insecure-http-parser", "index.js" ]

37
scraper/README.md Normal file
View File

@@ -0,0 +1,37 @@
# Torrentio Scraper
## Initial dumps
### The Pirate Bay
https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA
https://thepiratebay.org/static/dump/csv/
### Kickass
https://mega.nz/#F!tktzySBS!ndSEaK3Z-Uc3zvycQYxhJA
https://web.archive.org/web/20150416071329/http://kickass.to/api
### RARBG
Scrape movie and tv catalog using [www.webscraper.io](https://www.webscraper.io/) for available `imdbIds` and use those via the api to search for torrents.
Movies sitemap
```json
{"_id":"rarbg-movies","startUrl":["https://rarbgmirror.org/catalog/movies/[1-4235]"],"selectors":[{"id":"rarbg-movie-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```
TV sitemap
```json
{"_id":"rarbg-tv","startUrl":["https://rarbgmirror.org/catalog/tv/[1-609]"],"selectors":[{"id":"rarbg-tv-imdb-id","type":"SelectorHTML","parentSelectors":["_root"],"selector":".lista-rounded table td[width='110']","multiple":true,"regex":"tt[0-9]+","delay":0}]}
```
### Migrating Database
When migrating the database to a new instance, it is important to restart the `files_id_seq` sequence at the maximum existing file id plus 1, so newly inserted files do not collide with migrated ids.
```sql
ALTER SEQUENCE files_id_seq RESTART WITH <last_file_id + 1>;
```

14
scraper/index.js Normal file
View File

@@ -0,0 +1,14 @@
const express = require("express");
const server = express();
const { connect } = require('./lib/repository');
const { startScraper } = require('./scheduler/scheduler')
// Health-check endpoint: replies 200 OK for container orchestration probes.
server.get('/', function (req, res) {
  res.sendStatus(200);
});
// Establish the database connection before the scrape scheduler starts.
server.listen(process.env.PORT || 7000, async () => {
  await connect();
  console.log('Scraper started');
  startScraper();
});

72
scraper/lib/cache.js Normal file
View File

@@ -0,0 +1,72 @@
const cacheManager = require('cache-manager');
const mangodbStore = require('cache-manager-mongodb');
// Namespaced key prefixes so several caches can share one backing store.
const GLOBAL_KEY_PREFIX = 'stremio-torrentio';
const IMDB_ID_PREFIX = `${GLOBAL_KEY_PREFIX}|imdb_id`;
const KITSU_ID_PREFIX = `${GLOBAL_KEY_PREFIX}|kitsu_id`;
const METADATA_PREFIX = `${GLOBAL_KEY_PREFIX}|metadata`;
const TRACKERS_KEY_PREFIX = `${GLOBAL_KEY_PREFIX}|trackers`;
// NOTE(review): METADATA_TTL overrides BOTH of the next two values — confirm
// that a single env var controlling both TTLs is intended.
const GLOBAL_TTL = process.env.METADATA_TTL || 7 * 24 * 60 * 60; // 7 days
const MEMORY_TTL = process.env.METADATA_TTL || 2 * 60 * 60; // 2 hours
const TRACKERS_TTL = 2 * 24 * 60 * 60; // 2 days
const MONGO_URI = process.env.MONGODB_URI;
const memoryCache = initiateMemoryCache();
const remoteCache = initiateRemoteCache();
// Remote cache backed by MongoDB when MONGODB_URI is set; otherwise falls
// back to an in-process memory cache.
function initiateRemoteCache() {
  if (MONGO_URI) {
    return cacheManager.caching({
      store: mangodbStore,
      uri: MONGO_URI,
      options: {
        collection: 'torrentio_scraper_collection',
        useUnifiedTopology: true
      },
      ttl: GLOBAL_TTL,
      ignoreCacheErrors: true
    });
  } else {
    return cacheManager.caching({
      store: 'memory',
      ttl: GLOBAL_TTL
    });
  }
}
// Short-lived in-process cache for hot lookups.
function initiateMemoryCache() {
  return cacheManager.caching({
    store: 'memory',
    ttl: MEMORY_TTL
  });
}
// Read-through helper: return the cached value or run `method` and store it.
function cacheWrap(cache, key, method, options) {
  return cache.wrap(key, method, options);
}
function cacheWrapImdbId(key, method) {
  return cacheWrap(remoteCache, `${IMDB_ID_PREFIX}:${key}`, method, { ttl: GLOBAL_TTL });
}
function cacheWrapKitsuId(key, method) {
  return cacheWrap(remoteCache, `${KITSU_ID_PREFIX}:${key}`, method, { ttl: GLOBAL_TTL });
}
function cacheWrapMetadata(id, method) {
  return cacheWrap(memoryCache, `${METADATA_PREFIX}:${id}`, method, { ttl: MEMORY_TTL });
}
function cacheTrackers(method) {
  return cacheWrap(memoryCache, `${TRACKERS_KEY_PREFIX}`, method, { ttl: TRACKERS_TTL });
}
module.exports = {
  cacheWrapImdbId,
  cacheWrapKitsuId,
  cacheWrapMetadata,
  cacheTrackers
};

64
scraper/lib/extension.js Normal file
View File

@@ -0,0 +1,64 @@
// Recognised file extensions, grouped by media category.
const VIDEO_EXTENSIONS = [
  "3g2",
  "3gp",
  "avi",
  "flv",
  "mkv",
  "mk3d",
  "mov",
  "mp2",
  "mp4",
  "m4v",
  "mpe",
  "mpeg",
  "mpg",
  "mpv",
  "webm",
  "wmv",
  "ogm",
  "divx"
];
const SUBTITLE_EXTENSIONS = [
  "aqt",
  "gsub",
  "jss",
  "sub",
  "ttxt",
  "pjs",
  "psb",
  "rt",
  "smi",
  "slt",
  "ssf",
  "srt",
  "ssa",
  "ass",
  "usf",
  "idx",
  "vtt"
];
const DISK_EXTENSIONS = [
  "iso",
  "m2ts",
  "ts",
  "vob"
]
// Truthy when the filename ends in one of `extensions` (2-4 word characters
// after the final dot, compared case-insensitively).
const isExtension = (filename, extensions) => {
  const extensionMatch = filename.match(/\.(\w{2,4})$/);
  return extensionMatch && extensions.includes(extensionMatch[1].toLowerCase());
};
const isVideo = (filename) => isExtension(filename, VIDEO_EXTENSIONS);
const isSubtitle = (filename) => isExtension(filename, SUBTITLE_EXTENSIONS);
const isDisk = (filename) => isExtension(filename, DISK_EXTENSIONS);
module.exports = { isVideo, isSubtitle, isDisk }

155
scraper/lib/metadata.js Normal file
View File

@@ -0,0 +1,155 @@
const needle = require('needle');
const nameToImdb = require('name-to-imdb');
const googleIt = require('google-it');
const googleSr = require('google-sr');
const bing = require('nodejs-bing');
const he = require('he');
const { cacheWrapImdbId, cacheWrapKitsuId, cacheWrapMetadata } = require('./cache');
const { Type } = require('./types');
const { getRandomUserAgent } = require('./requestHelper');
const CINEMETA_URL = 'https://v3-cinemeta.strem.io';
const KITSU_URL = 'https://anime-kitsu.strem.fun';
const TIMEOUT = 20000;
// Resolve full metadata for an id (numeric kitsu id or imdb 'tt...') via the
// anime-kitsu addon first, then Cinemeta, retrying with the alternate type.
function getMetadata(id, type = Type.SERIES) {
  if (!id) {
    return Promise.reject("no valid id provided");
  }
  // Purely numeric ids are kitsu ids; prefix them for the addon URL.
  const key = Number.isInteger(id) || id.match(/^\d+$/) ? `kitsu:${id}` : id;
  const metaType = type === Type.MOVIE ? Type.MOVIE : Type.SERIES;
  return cacheWrapMetadata(key, () => _requestMetadata(`${KITSU_URL}/meta/${metaType}/${key}.json`)
      .catch(() => _requestMetadata(`${CINEMETA_URL}/meta/${metaType}/${key}.json`))
      .catch(() => {
        // try different type in case there was a mismatch
        const otherType = metaType === Type.MOVIE ? Type.SERIES : Type.MOVIE;
        return _requestMetadata(`${CINEMETA_URL}/meta/${otherType}/${key}.json`)
      })
      .catch((error) => {
        throw new Error(`failed metadata query ${key} due: ${error.message}`);
      }));
}
// Fetch and normalise a metadata document from a Stremio meta addon endpoint.
// Resolves a flat object (ids, title, year, videos, episode counts); throws
// 'No search results' when the response has no usable meta.
function _requestMetadata(url) {
  return needle('get', url, { open_timeout: TIMEOUT })
      .then((response) => {
        const body = response.body;
        if (body && body.meta && (body.meta.imdb_id || body.meta.kitsu_id)) {
          return {
            kitsuId: body.meta.kitsu_id,
            imdbId: body.meta.imdb_id,
            type: body.meta.type,
            title: body.meta.name,
            year: body.meta.year,
            country: body.meta.country,
            genres: body.meta.genres,
            status: body.meta.status,
            videos: (body.meta.videos || [])
                // Entries with an integer imdbSeason are imdb-mapped episodes;
                // the rest are treated as kitsu-mapped entries.
                .map((video) => Number.isInteger(video.imdbSeason)
                    ? {
                      name: video.name || video.title,
                      season: video.season,
                      episode: video.episode,
                      imdbSeason: video.imdbSeason,
                      imdbEpisode: video.imdbEpisode
                    }
                    : {
                      name: video.name || video.title,
                      season: video.season,
                      episode: video.episode,
                      kitsuId: video.kitsu_id,
                      kitsuEpisode: video.kitsuEpisode,
                      released: video.released
                    }
                ),
            // Episode count per season (season/episode 0 specials excluded).
            episodeCount: Object.values((body.meta.videos || [])
                .filter((entry) => entry.season !== 0 && entry.episode !== 0)
                .sort((a, b) => a.season - b.season)
                .reduce((map, next) => {
                  map[next.season] = map[next.season] + 1 || 1;
                  return map;
                }, {})),
            totalCount: body.meta.videos && body.meta.videos
                .filter((entry) => entry.season !== 0 && entry.episode !== 0).length
          };
        } else {
          throw new Error('No search results');
        }
      });
}
function escapeTitle(title) {
  // Normalisation pipeline: each [pattern, replacement] pair is applied in
  // order to the lower-cased, NFKD-normalised title.
  const steps = [
    [/[\u0300-\u036F]/g, ''],                                  // strip combining diacritics
    [/&/g, 'and'],
    [/[;, ~./]+/g, ' '],                                       // punctuation runs -> single space
    [/[^\w \-()+#@!'\u0400-\u04ff]+/g, ''],                    // drop other non-alphanumerics
    [/^\d{1,2}[.#\s]+(?=(?:\d+[.\s]*)?[\u0400-\u04ff])/i, ''], // remove russian movie numbering
    [/\s{2,}/, ' '],                                           // collapse first double-space run
  ];
  const normalized = title.toLowerCase().normalize('NFKD');
  return steps
      .reduce((value, [pattern, replacement]) => value.replace(pattern, replacement), normalized)
      .trim();
}
function escapeHTML(title) {
  // Decode HTML entities, then undo double-encoded ampersands.
  const decoded = he.decode(title);
  return decoded.replace(/&amp;/g, "&");
}
// Resolve an imdb id for a title (cached). Tries name-to-imdb first, then
// falls back to scanning google search results for an imdb.com/title link.
// Resolves undefined when nothing is found.
async function getImdbId(info, type) {
  const name = escapeTitle(info.title);
  const year = info.year || info.date && info.date.slice(0, 4);
  const key = `${name}_${year}_${type}`;
  const query = `${name} ${year || ''} ${type} imdb`;
  return cacheWrapImdbId(key,
      () => new Promise((resolve, reject) => {
        nameToImdb({ name: encodeURIComponent(name), year: info.year, type }, function (err, res) {
          if (res) {
            resolve(res);
          } else {
            reject(err || new Error('failed imdbId search'));
          }
        });
        // }).catch(() => googleIt({ query, userAgent: getRandomUserAgent(), disableConsole: true })
        //     .then(results => results.length ? results : Promise.reject('No results'))
      }).catch(() => googleSr(query)
          .then(response => response.searchResults.length ? response.searchResults : Promise.reject('No results'))
          // .catch(() => bing.web(query))
          .then(results => results
              .map(result => result.link)
              .find(result => result.includes('imdb.com/title/')))
          .then(result => result && result.match(/imdb\.com\/title\/(tt\d+)/))
          .then(match => match && match[1])))
      // Normalise to a zero-padded id of at least 7 digits, e.g. tt0012345.
      .then(imdbId => imdbId && 'tt' + imdbId.replace(/tt0*([1-9][0-9]*)$/, '$1').padStart(7, '0'));
}
// Resolve a kitsu id for an anime title (optionally qualified by year and
// season) via the anime-kitsu catalog search; the result is cached.
async function getKitsuId(info) {
  // Drop any " | alternate title" suffix before searching.
  const title = escapeTitle(info.title.replace(/\s\|\s.*/, ''));
  const year = info.year ? ` ${info.year}` : '';
  const season = info.season > 1 ? ` S${info.season}` : '';
  const key = `${title}${year}${season}`;
  const query = encodeURIComponent(key);
  return cacheWrapKitsuId(key,
      () => needle('get', `${KITSU_URL}/catalog/series/kitsu-anime-list/search=${query}.json`, { open_timeout: 60000 })
          .then((response) => {
            const body = response.body;
            if (body && body.metas && body.metas.length) {
              return body.metas[0].id.replace('kitsu:', '');
            } else {
              throw new Error('No search results');
            }
          }));
}
async function isEpisodeImdbId(imdbId) {
  // Without an id we cannot tell; treat as "not an episode".
  if (!imdbId) {
    return false;
  }
  try {
    const response = await needle('get', `https://www.imdb.com/title/${imdbId}/`, { open_timeout: 10000, follow: 2 });
    return !!(response.body && response.body.includes('video.episode'));
  } catch (err) {
    return false;
  }
}
module.exports = { getMetadata, getImdbId, getKitsuId, isEpisodeImdbId, escapeHTML, escapeTitle };

100
scraper/lib/parseHelper.js Normal file
View File

@@ -0,0 +1,100 @@
const { parse } = require('parse-torrent-title');
const { Type } = require('./types');
const MULTIPLE_FILES_SIZE = 4 * 1024 * 1024 * 1024; // 4 GB
function parseSeriesVideos(torrent, videos) {
  // Parse the torrent name once and reuse it for every contained video.
  const torrentInfo = parse(torrent.title);
  const containsMovies = torrentInfo.complete || !!torrent.title.match(/movies?(?:\W|$)/i);
  const parsedVideos = videos.map((video) => parseSeriesVideo(video, torrentInfo));
  return parsedVideos.map((video) => ({
    ...video,
    isMovie: isMovieVideo(video, parsedVideos, torrent.type, containsMovies),
  }));
}
// Derive season/episode info for a single video file, falling back to folder
// names, the parsed torrent title, and several naming heuristics in turn.
function parseSeriesVideo(video, parsedTorrentName) {
  const videoInfo = parse(video.name);
  // the episode may be in a folder containing season number
  if (!Number.isInteger(videoInfo.season) && video.path.includes('/')) {
    const folders = video.path.split('/');
    const pathInfo = parse(folders[folders.length - 2]);
    videoInfo.season = pathInfo.season;
  }
  if (!Number.isInteger(videoInfo.season) && parsedTorrentName.season) {
    videoInfo.season = parsedTorrentName.season;
  }
  if (!Number.isInteger(videoInfo.season) && videoInfo.seasons && videoInfo.seasons.length > 1) {
    // in case single file was interpreted as having multiple seasons
    videoInfo.season = videoInfo.seasons[0];
  }
  if (!Number.isInteger(videoInfo.season) && video.path.includes('/') && parsedTorrentName.seasons
      && parsedTorrentName.seasons.length > 1) {
    // russian season are usually named with 'series name-2` i.e. Улицы разбитых фонарей-6/22. Одиночный выстрел.mkv
    const folderPathSeasonMatch = video.path.match(/[\u0400-\u04ff]-(\d{1,2})(?=.*\/)/);
    videoInfo.season = folderPathSeasonMatch && parseInt(folderPathSeasonMatch[1], 10) || undefined;
  }
  // sometimes video file does not have correct date format as in torrent title
  if (!videoInfo.episodes && !videoInfo.date && parsedTorrentName.date) {
    videoInfo.date = parsedTorrentName.date;
  }
  // limit number of episodes in case of incorrect parsing
  if (videoInfo.episodes && videoInfo.episodes.length > 20) {
    videoInfo.episodes = [videoInfo.episodes[0]];
    videoInfo.episode = videoInfo.episodes[0];
  }
  // force episode to any found number if it was not parsed
  if (!videoInfo.episodes && !videoInfo.date) {
    const epMatcher = videoInfo.title.match(
        /(?<!season\W*|disk\W*|movie\W*|film\W*)(?:^|\W)(\d{1,4})(?:a|b|c|v\d)?(?:\W|$)(?!disk|movie|film)/i);
    videoInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
    videoInfo.episode = videoInfo.episodes && videoInfo.episodes[0];
  }
  // last resort: "(year) 12" or "part 12" style numbering in the file name
  if (!videoInfo.episodes && !videoInfo.date) {
    const epMatcher = video.name.match(new RegExp(`(?:\\(${videoInfo.year}\\)|part)[._ ]?(\\d{1,3})(?:\\b|_)`, "i"));
    videoInfo.episodes = epMatcher && [parseInt(epMatcher[1], 10)];
    videoInfo.episode = videoInfo.episodes && videoInfo.episodes[0];
  }
  return { ...video, ...videoInfo };
}
// Heuristic: decide whether a single parsed video inside a torrent is a movie
// rather than a series episode.
function isMovieVideo(video, otherVideos, type, hasMovies) {
  if (Number.isInteger(video.season) && Array.isArray(video.episodes)) {
    // not movie if video has season
    return false;
  }
  if (video.name.match(/\b(?:\d+[ .]movie|movie[ .]\d+)\b/i)) {
    // movie if video explicitly has numbered movie keyword in the name, ie. 1 Movie or Movie 1
    return true;
  }
  if (!hasMovies && type !== Type.ANIME) {
    // not movie if torrent name does not contain movies keyword or is not a pack torrent and is not anime
    return false;
  }
  if (!video.episodes) {
    // movie if there's no episode info it could be a movie
    return true;
  }
  // movie if contains year info and there aren't more than 3 video with same title and year
  // as some series titles might contain year in it.
  // Bug fix: compare the filter result's .length — comparing the array itself
  // to a number coerces through NaN and was almost always false.
  return !!video.year
      && otherVideos.length > 3
      && otherVideos.filter(other => other.title === video.title && other.year === video.year).length < 3;
}
function isPackTorrent(torrent) {
  // An explicit pack flag wins outright.
  if (torrent.pack) {
    return true;
  }
  const info = parse(torrent.title);
  if (torrent.type === Type.MOVIE) {
    // Movie packs: "complete", a year-range string, or a "movies" keyword.
    return info.complete || typeof info.year === 'string' || /movies/i.test(torrent.title);
  }
  const manyEpisodes =
      info.complete ||
      torrent.size > MULTIPLE_FILES_SIZE ||
      (info.seasons && info.seasons.length > 1) ||
      (info.episodes && info.episodes.length > 1) ||
      (info.seasons && !info.episodes);
  const singleEpisode = Number.isInteger(info.episode) || (!info.episodes && info.date);
  return manyEpisodes && !singleEpisode;
}
module.exports = { parseSeriesVideos, isPackTorrent }

57
scraper/lib/promises.js Normal file
View File

@@ -0,0 +1,57 @@
/**
 * Run promise-returning factories one after another, in order.
 * @param {Array<() => Promise<any>>} promises - factories executed sequentially.
 * @returns {Promise<any[]>} results of every factory, concatenated in order.
 */
async function sequence(promises) {
  let chain = Promise.resolve([]);
  for (const func of promises) {
    chain = chain.then((result) => func().then((next) => result.concat(next)));
  }
  return chain;
}
/**
 * Resolve with the first promise that fulfils; reject with an array of all
 * errors when every promise rejects.
 */
async function first(promises) {
  // Invert each promise so Promise.all bails out on the first SUCCESS:
  // fulfilments become rejections (carrying the value) and vice versa.
  const inverted = promises.map((p) => p.then(
      (val) => Promise.reject(val),
      (err) => Promise.resolve(err),
  ));
  return Promise.all(inverted).then(
      // 'all' resolving means everything failed: reject with the errors.
      (errors) => Promise.reject(errors),
      // 'all' rejecting carries the first successful value.
      (val) => Promise.resolve(val),
  );
}
/**
 * Resolve after the given number of milliseconds.
 */
async function delay(duration) {
  let release;
  const gate = new Promise((resolve) => { release = resolve; });
  setTimeout(release, duration);
  return gate;
}
/**
 * Race a promise against a timer; reject with `message` after timeoutMs.
 * The timer is cleared once either side settles, so a fast promise no
 * longer leaves a pending timeout keeping the event loop alive.
 */
async function timeout(timeoutMs, promise, message = 'Timed out') {
  let timer;
  const timeoutPromise = new Promise(function (resolve, reject) {
    timer = setTimeout(function () {
      reject(message);
    }, timeoutMs);
  });
  return Promise.race([promise, timeoutPromise])
      .finally(() => clearTimeout(timer));
}
/**
 * Return the most common value from the given array.
 * Fix: operates on a copy — the previous version sorted the caller's
 * array in place as a side effect.
 */
function mostCommonValue(array) {
  return [...array]
      .sort((a, b) => array.filter(v => v === a).length - array.filter(v => v === b).length)
      .pop();
}
module.exports = { sequence, first, delay, timeout, mostCommonValue };

346
scraper/lib/repository.js Normal file
View File

@@ -0,0 +1,346 @@
const moment = require('moment');
const Promises = require('./promises')
const { Sequelize, DataTypes, fn, col, literal } = require('sequelize');
const Op = Sequelize.Op;
const DATABASE_URI = process.env.DATABASE_URI;
const database = new Sequelize(
DATABASE_URI,
{
logging: false
}
);
const Provider = database.define('provider', {
name: { type: DataTypes.STRING(32), primaryKey: true },
lastScraped: { type: DataTypes.DATE },
lastScrapedId: { type: DataTypes.STRING(128) }
});
const Torrent = database.define('torrent',
{
infoHash: { type: DataTypes.STRING(64), primaryKey: true },
provider: { type: DataTypes.STRING(32), allowNull: false },
torrentId: { type: DataTypes.STRING(512) },
title: { type: DataTypes.STRING(512), allowNull: false },
size: { type: DataTypes.BIGINT },
type: { type: DataTypes.STRING(16), allowNull: false },
uploadDate: { type: DataTypes.DATE, allowNull: false },
seeders: { type: DataTypes.SMALLINT },
trackers: { type: DataTypes.STRING(4096) },
languages: { type: DataTypes.STRING(4096) },
resolution: { type: DataTypes.STRING(16) },
reviewed: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: false },
opened: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: false }
}
);
const File = database.define('file',
{
id: { type: DataTypes.BIGINT, autoIncrement: true, primaryKey: true },
infoHash: {
type: DataTypes.STRING(64),
allowNull: false,
references: { model: Torrent, key: 'infoHash' },
onDelete: 'CASCADE'
},
fileIndex: { type: DataTypes.INTEGER },
title: { type: DataTypes.STRING(512), allowNull: false },
size: { type: DataTypes.BIGINT },
imdbId: { type: DataTypes.STRING(32) },
imdbSeason: { type: DataTypes.INTEGER },
imdbEpisode: { type: DataTypes.INTEGER },
kitsuId: { type: DataTypes.INTEGER },
kitsuEpisode: { type: DataTypes.INTEGER }
},
{
indexes: [
{
unique: true,
name: 'files_unique_file_constraint',
fields: [
col('infoHash'),
fn('COALESCE', (col('fileIndex')), -1),
fn('COALESCE', (col('imdbId')), 'null'),
fn('COALESCE', (col('imdbSeason')), -1),
fn('COALESCE', (col('imdbEpisode')), -1),
fn('COALESCE', (col('kitsuId')), -1),
fn('COALESCE', (col('kitsuEpisode')), -1)
]
},
{ unique: false, fields: ['imdbId', 'imdbSeason', 'imdbEpisode'] },
{ unique: false, fields: ['kitsuId', 'kitsuEpisode'] }
]
}
);
const Subtitle = database.define('subtitle',
{
infoHash: {
type: DataTypes.STRING(64),
allowNull: false,
references: { model: Torrent, key: 'infoHash' },
onDelete: 'CASCADE'
},
fileIndex: {
type: DataTypes.INTEGER,
allowNull: false
},
fileId: {
type: DataTypes.BIGINT,
allowNull: true,
references: { model: File, key: 'id' },
onDelete: 'SET NULL'
},
title: { type: DataTypes.STRING(512), allowNull: false },
},
{
timestamps: false,
indexes: [
{
unique: true,
name: 'subtitles_unique_subtitle_constraint',
fields: [
col('infoHash'),
col('fileIndex'),
fn('COALESCE', (col('fileId')), -1)
]
},
{ unique: false, fields: ['fileId'] }
]
}
);
const Content = database.define('content',
{
infoHash: {
type: DataTypes.STRING(64),
primaryKey: true,
allowNull: false,
references: { model: Torrent, key: 'infoHash' },
onDelete: 'CASCADE'
},
fileIndex: {
type: DataTypes.INTEGER,
primaryKey: true,
allowNull: false
},
path: { type: DataTypes.STRING(512), allowNull: false },
size: { type: DataTypes.BIGINT },
},
{
timestamps: false,
}
);
// Blacklist of infoHashes that should never be processed again.
const SkipTorrent = database.define('skip_torrent', {
  infoHash: { type: DataTypes.STRING(64), primaryKey: true },
});
// Model associations. DB-level FK constraints are disabled here because the
// column definitions above already declare their own references/onDelete.
Torrent.hasMany(File, { foreignKey: 'infoHash', constraints: false });
File.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
Torrent.hasMany(Content, { foreignKey: 'infoHash', constraints: false });
Content.belongsTo(Torrent, { foreignKey: 'infoHash', constraints: false });
File.hasMany(Subtitle, { foreignKey: 'fileId', constraints: false });
Subtitle.belongsTo(File, { foreignKey: 'fileId', constraints: false });
/**
 * Initializes the database connection. Schema synchronization is opt-in via
 * the ENABLE_SYNC environment variable; without it the tables are assumed to
 * already exist and the call resolves immediately.
 * @returns {Promise<void>} rejects (after logging) when sync fails.
 */
async function connect() {
  if (!process.env.ENABLE_SYNC) {
    return;
  }
  try {
    await database.sync({ alter: true });
  } catch (error) {
    console.error('Failed syncing database: ', error);
    throw error;
  }
}
// Find-or-create the provider row by name, resolving to the stored instance.
// On any DB error it deliberately falls back to the in-memory object so
// scraping can continue without provider bookkeeping (best-effort).
function getProvider(provider) {
  return Provider.findOrCreate({ where: { name: { [Op.eq]: provider.name } }, defaults: provider })
      .then((result) => result[0])
      .catch(() => provider);
}
/**
 * Fetches a single stored torrent. Prefers lookup by infoHash; falls back to
 * the (provider, torrentId) pair when no infoHash is known.
 * @returns {Promise<Object|null>} the model instance or null when missing.
 */
function getTorrent(torrent) {
  const where = torrent.infoHash
      ? { infoHash: torrent.infoHash }
      : { provider: torrent.provider, torrentId: torrent.torrentId };
  return Torrent.findOne({ where });
}
/**
 * Fetches all torrents of the given type whose title matches the regexp.
 * The query string is passed verbatim to the DB regexp operator.
 */
function getTorrentsBasedOnTitle(titleQuery, type) {
  const where = { title: { [Op.regexp]: `${titleQuery}` }, type: type };
  return getTorrentsBasedOnQuery(where);
}
// Thin wrapper: fetch all torrents matching an arbitrary Sequelize where clause.
function getTorrentsBasedOnQuery(where) {
  return Torrent.findAll({ where });
}
// Torrents whose real size was never resolved, most-seeded first.
// NOTE(review): 300000000 looks like a placeholder file size written by the
// file-creation path elsewhere — confirm before changing this magic value.
function getTorrentsWithoutSize() {
  return Torrent.findAll({
    where: literal(
      'exists (select 1 from files where files."infoHash" = torrent."infoHash" and files.size = 300000000)'),
    order: [
      ['seeders', 'DESC']
    ]
  });
}
/**
 * Torrents whose seeder counts have gone stale (not refreshed for 7 days),
 * most-seeded/oldest-updated first. The interpolated date is produced by
 * moment's formatter, so the raw SQL literal is safe.
 * @param {number} limit - maximum rows to return (default 50).
 */
function getUpdateSeedersTorrents(limit = 50) {
  const staleDate = moment().subtract(7, 'days').format('YYYY-MM-DD');
  const where = literal(`torrent."updatedAt" < '${staleDate}'`);
  return Torrent.findAll({
    where,
    limit,
    order: [['seeders', 'DESC'], ['updatedAt', 'ASC']]
  });
}
// Recently added torrents (last 4 days) whose seeders are older than 12 hours
// — refreshed more aggressively since new releases change fast.
// NOTE(review): the 'YYYY-MM-DD' format truncates to day precision, so the
// 12-hour threshold effectively behaves at date granularity — confirm intent.
function getUpdateSeedersNewTorrents(limit = 50) {
  const lastUpdate = moment().subtract(12, 'hours').format('YYYY-MM-DD');
  const createdAfter = moment().subtract(4, 'days').format('YYYY-MM-DD');
  return Torrent.findAll({
    where: literal(`torrent."updatedAt" < \'${lastUpdate}\' AND torrent."createdAt" > \'${createdAfter}\'`),
    limit: limit,
    order: [
      ['seeders', 'ASC'],
      ['updatedAt', 'ASC']
    ]
  });
}
/**
 * Random batch (max 500) of seeded torrents whose file listing was never
 * opened, i.e. candidates for content resolution.
 */
function getNoContentsTorrents() {
  const where = { opened: false, seeders: { [Op.gte]: 1 } };
  return Torrent.findAll({ where, limit: 500, order: [[fn('RANDOM')]] });
}
/**
 * Upserts the torrent row, then persists its raw contents and its subtitles.
 * @param {Object} torrent - torrent attributes plus optional contents/subtitles arrays.
 */
async function createTorrent(torrent) {
  await Torrent.upsert(torrent);
  await createContents(torrent.infoHash, torrent.contents);
  return createSubtitles(torrent.infoHash, torrent.subtitles);
}
/**
 * Updates the seeder count of one stored torrent, identified by infoHash
 * when available, otherwise by the (provider, torrentId) pair.
 */
function setTorrentSeeders(torrent, seeders) {
  const where = torrent.infoHash
      ? { infoHash: torrent.infoHash }
      : { provider: torrent.provider, torrentId: torrent.torrentId };
  return Torrent.update({ seeders: seeders }, { where });
}
// Deletes a torrent row; dependent files/contents/subtitles cascade via the
// FK declarations on their columns.
function deleteTorrent(torrent) {
  const where = { infoHash: torrent.infoHash };
  return Torrent.destroy({ where });
}
// Persists a video file row. Existing rows (file.id set) are saved/upserted
// and their subtitles reconciled; new rows are created together with any
// nested subtitles in a single create call.
function createFile(file) {
  if (file.id) {
    // A live model instance is saved directly; a plain object is upserted.
    return (file.dataValues ? file.save() : File.upsert(file))
        .then(() => upsertSubtitles(file, file.subtitles));
  }
  if (file.subtitles && file.subtitles.length) {
    // Stamp each nested subtitle with the parent's infoHash and a title.
    file.subtitles = file.subtitles.map(subtitle => ({ infoHash: file.infoHash, title: subtitle.path, ...subtitle }));
  }
  return File.create(file, { include: [Subtitle], ignoreDuplicates: true });
}
// All video file rows recorded for the given torrent.
function getFiles(torrent) {
  const where = { infoHash: torrent.infoHash };
  return File.findAll({ where });
}
// Video file rows whose title matches the given regexp (DB regexp operator).
function getFilesBasedOnTitle(titleQuery) {
  const where = { title: { [Op.regexp]: `${titleQuery}` } };
  return File.findAll({ where });
}
// Deletes a single video file row by primary key; linked subtitles get their
// fileId nulled via the column's SET NULL rule.
function deleteFile(file) {
  const where = { id: file.id };
  return File.destroy({ where });
}
/**
 * Bulk-inserts subtitle rows for a torrent, defaulting each title to the
 * subtitle's path. Resolves immediately when there is nothing to insert.
 */
function createSubtitles(infoHash, subtitles) {
  if (!subtitles || subtitles.length === 0) {
    return Promise.resolve();
  }
  const rows = subtitles.map(subtitle => ({ infoHash, title: subtitle.path, ...subtitle }));
  return Subtitle.bulkCreate(rows);
}
// Links subtitles to a persisted file (file.id required) and saves them one
// at a time, mutating each subtitle with the owning file's id/infoHash.
// Existing model instances are saved; plain objects are created.
function upsertSubtitles(file, subtitles) {
  if (file.id && subtitles && subtitles.length) {
    // Promises.sequence runs the saves sequentially, not in parallel.
    return Promises.sequence(subtitles
        .map(subtitle => {
          subtitle.fileId = file.id;
          subtitle.infoHash = subtitle.infoHash || file.infoHash;
          subtitle.title = subtitle.title || subtitle.path;
          return subtitle;
        })
        .map(subtitle => () => subtitle.dataValues ? subtitle.save() : Subtitle.create(subtitle)));
  }
  return Promise.resolve();
}
// All subtitle rows recorded for the given torrent.
function getSubtitles(torrent) {
  const where = { infoHash: torrent.infoHash };
  return Subtitle.findAll({ where });
}
// Subtitle rows not yet linked to a video file row (fileId is NULL).
function getUnassignedSubtitles() {
  const where = { fileId: null };
  return Subtitle.findAll({ where });
}
/**
 * Bulk-inserts the torrent's raw file listing and then flags the torrent as
 * opened (silent update keeps updatedAt intact). No-op for empty input.
 */
async function createContents(infoHash, contents) {
  if (!contents || contents.length === 0) {
    return;
  }
  const rows = contents.map(content => ({ infoHash, ...content }));
  await Content.bulkCreate(rows, { ignoreDuplicates: true });
  return Torrent.update({ opened: true }, { where: { infoHash: infoHash }, silent: true });
}
// Raw content rows (full file listing) recorded for the given torrent.
function getContents(torrent) {
  const where = { infoHash: torrent.infoHash };
  return Content.findAll({ where });
}
/**
 * Looks the torrent up in the skip-list. Resolves with the row's plain values
 * or rejects when the infoHash is not blacklisted.
 */
async function getSkipTorrent(torrent) {
  const result = await SkipTorrent.findByPk(torrent.infoHash);
  if (!result) {
    throw new Error(`torrent not found: ${torrent.infoHash}`);
  }
  return result.dataValues;
}
// Records the torrent's infoHash in the skip-list so it is not parsed again.
function createSkipTorrent(torrent) {
  const { infoHash } = torrent;
  return SkipTorrent.upsert({ infoHash });
}
// Public repository API consumed by the scraper and entry-processing modules.
module.exports = {
  connect,
  getProvider,
  createTorrent,
  setTorrentSeeders,
  getTorrent,
  getTorrentsBasedOnTitle,
  getTorrentsBasedOnQuery,
  deleteTorrent,
  getUpdateSeedersTorrents,
  getUpdateSeedersNewTorrents,
  getNoContentsTorrents,
  createFile,
  getFiles,
  getFilesBasedOnTitle,
  deleteFile,
  createSubtitles,
  upsertSubtitles,
  getSubtitles,
  getUnassignedSubtitles,
  createContents,
  getContents,
  getSkipTorrent,
  createSkipTorrent,
  getTorrentsWithoutSize
};

View File

@@ -0,0 +1,21 @@
const UserAgent = require('user-agents');
// Shared generator instance; random() derives a fresh generator each call.
const userAgent = new UserAgent();

// Returns a random, realistic browser user-agent string.
function getRandomUserAgent() {
  return userAgent.random().toString();
}
/**
 * Builds default HTTP request options for scrapers. When both PROXY_HOST and
 * PROXY_TYPE are configured in the environment, requests are routed through
 * the proxy with a randomized user agent; otherwise only the user agent is set.
 * @returns {Object} request options.
 */
function defaultOptionsWithProxy() {
  const { PROXY_HOST, PROXY_TYPE } = process.env;
  if (PROXY_HOST && PROXY_TYPE) {
    return {
      proxy: PROXY_HOST,
      headers: {
        'user-agent': getRandomUserAgent(),
        'proxy-type': PROXY_TYPE
      }
    };
  }
  return { userAgent: getRandomUserAgent() };
}
module.exports = { getRandomUserAgent, defaultOptionsWithProxy };

208
scraper/lib/torrent.js Normal file
View File

@@ -0,0 +1,208 @@
const torrentStream = require('torrent-stream');
const needle = require('needle');
const parseTorrent = require('parse-torrent');
const BTClient = require('bittorrent-tracker')
const async = require('async');
const decode = require('magnet-uri');
const { Type } = require('./types');
const { delay } = require('./promises')
const { isVideo, isSubtitle } = require('./extension');
const { cacheTrackers } = require('./cache');
// Community-maintained master list of public trackers.
const TRACKERS_URL = 'https://ngosang.github.io/trackerslist/trackers_all.txt';
// Peer connection cap per torrent-stream engine.
// NOTE(review): env values are strings, so an override keeps string type —
// confirm torrent-stream tolerates that.
const MAX_PEER_CONNECTIONS = process.env.MAX_PEER_CONNECTIONS || 20;
const SEEDS_CHECK_TIMEOUT = 15 * 1000; // 15 secs
// Extra trackers always appended to the downloaded defaults.
const ADDITIONAL_TRACKERS = [
  'http://tracker.trackerfix.com:80/announce',
  'udp://9.rarbg.me:2780',
  'udp://9.rarbg.to:2870'
];
// Trackers only appended for anime-typed torrents.
const ANIME_TRACKERS = [
  "http://nyaa.tracker.wf:7777/announce",
  "udp://anidex.moe:6969/announce",
  "udp://tracker-udp.anirena.com:80/announce",
  "udp://tracker.uw0.xyz:6969/announce"
];
// Refreshes seeder counts by scraping every relevant tracker with the
// bittorrent-tracker client. Accepts a single torrent or an array; mutates
// each torrent's `seeders` in place and resolves with the original input.
async function updateCurrentSeeders(torrentsInput) {
  return new Promise(async (resolve) => {
    const torrents = Array.isArray(torrentsInput) ? torrentsInput : [torrentsInput];
    // infoHash -> { tracker -> [complete, incomplete] } accumulator.
    const perTorrentResults = Object.fromEntries(new Map(torrents.map(torrent => [torrent.infoHash, {}])));
    // Invert to tracker -> [infoHash...] so each tracker is scraped only once.
    const perTrackerInfoHashes = await Promise.all(torrents.map(torrent => getTorrentTrackers(torrent)
        .then(torrentTrackers => ({ infoHash: torrent.infoHash, trackers: torrentTrackers }))))
        .then(allTorrentTrackers => allTorrentTrackers
            .reduce((allTrackersMap, torrentTrackers) => {
              torrentTrackers.trackers.forEach(tracker =>
                  allTrackersMap[tracker] = (allTrackersMap[tracker] || []).concat(torrentTrackers.infoHash));
              return allTrackersMap;
            }, {}));
    let successCounter = 0;
    const callback = () => {
      console.log(`Total successful tracker responses: ${successCounter}`)
      resolve(perTorrentResults);
    }
    // Hard deadline: resolve with whatever responses arrived by then.
    // NOTE(review): the timer is never cleared, so `callback` can run twice
    // (timeout + async.each completion); the second resolve is a no-op, but
    // the log line may print twice.
    setTimeout(callback, SEEDS_CHECK_TIMEOUT);
    async.each(Object.keys(perTrackerInfoHashes), function (tracker, ready) {
      BTClient.scrape({ infoHash: perTrackerInfoHashes[tracker], announce: tracker }, (error, response) => {
        if (response) {
          // Single-hash scrapes return a flat object instead of a hash-keyed map.
          const results = Array.isArray(torrentsInput) ? Object.entries(response) : [[response.infoHash, response]];
          results
              .filter(([infoHash]) => perTorrentResults[infoHash])
              .forEach(([infoHash, seeders]) =>
                  perTorrentResults[infoHash][tracker] = [seeders.complete, seeders.incomplete])
          successCounter++;
        } else if (error) {
          perTrackerInfoHashes[tracker]
              .filter(infoHash => perTorrentResults[infoHash])
              .forEach(infoHash => perTorrentResults[infoHash][tracker] = [0, 0, error.message])
        }
        ready();
      })
    }, callback);
  }).then(perTorrentResults => {
    const torrents = Array.isArray(torrentsInput) ? torrentsInput : [torrentsInput];
    torrents.forEach(torrent => {
      const results = perTorrentResults[torrent.infoHash];
      // Best (max) "complete" count across all trackers; 0 when none replied.
      const newSeeders = Math.max(...Object.values(results).map(values => values[0]).concat(0));
      if (torrent.seeders !== newSeeders) {
        console.log(`Updating seeders for [${torrent.infoHash}] ${torrent.title} - ${torrent.seeders} -> ${newSeeders}`)
        torrent.seeders = newSeeders;
      }
    })
    return torrentsInput;
  });
}
/**
 * Resolves the torrent's real size and file listing from the swarm and
 * writes them onto the torrent object in place.
 * @returns {Promise<Object>} the same (mutated) torrent.
 */
async function updateTorrentSize(torrent) {
  const { size, files } = await filesAndSizeFromTorrentStream(torrent, SEEDS_CHECK_TIMEOUT);
  torrent.size = size;
  torrent.files = files;
  return torrent;
}
// Like updateTorrentSize but without mutation and with a longer (30s) window.
async function sizeAndFiles(torrent) {
  const timeoutMs = 30000;
  return filesAndSizeFromTorrentStream(torrent, timeoutMs);
}
/**
 * Resolves the torrent's file listing, trying the cheapest source first:
 * files already on the object, then the .torrent file, then the swarm.
 * @returns {Promise<{contents: Array, videos: Array, subtitles: Array}>}
 */
async function torrentFiles(torrent, timeout) {
  const files = await getFilesFromObject(torrent)
      .catch(() => filesFromTorrentFile(torrent))
      .catch(() => filesFromTorrentStream(torrent, timeout));
  return {
    contents: files,
    videos: filterVideos(files),
    subtitles: filterSubtitles(files)
  };
}
/**
 * Returns files already carried by the torrent object: either a plain array
 * or a lazy provider function. Rejects when neither is present.
 * Fixes: rejects with a proper Error (was a bare string) and wraps the lazy
 * provider so a synchronous return value or throw still yields a promise.
 * @returns {Promise<Array>} the torrent's file listing.
 */
function getFilesFromObject(torrent) {
  if (Array.isArray(torrent.files)) {
    return Promise.resolve(torrent.files);
  }
  if (typeof torrent.files === 'function') {
    return Promise.resolve().then(() => torrent.files());
  }
  return Promise.reject(new Error('No files in the object'));
}
// Downloads and parses the .torrent file to obtain the file listing.
// Rejects when no torrentLink exists or the download fails.
async function filesFromTorrentFile(torrent) {
  if (!torrent.torrentLink) {
    return Promise.reject(new Error("no torrentLink"));
  }
  return needle('get', torrent.torrentLink, { open_timeout: 10000 })
      .then((response) => {
        if (!response.body || response.statusCode !== 200) {
          throw new Error('torrent not found')
        }
        return response.body
      })
      .then((body) => parseTorrent(body))
      .then((info) => info.files.map((file, fileId) => ({
        fileIndex: fileId,
        name: file.name,
        // Strip the leading root-directory segment from the path.
        path: file.path.replace(/^[^\/]+\//, ''),
        size: file.length
      })));
}
// Joins the swarm and returns only the file listing (discarding total size).
async function filesFromTorrentStream(torrent, timeout) {
  const result = await filesAndSizeFromTorrentStream(torrent, timeout);
  return result.files;
}
/**
 * Joins the swarm with torrent-stream and resolves the file listing plus the
 * torrent's total size. Rejects when metadata does not arrive in time.
 * Fix: the timeout is now cleared once the engine is ready — previously the
 * dangling timer destroyed the engine a second time, rejected an already
 * settled promise, and kept the event loop alive for the full window.
 * @returns {Promise<{files: Array, size: number}>}
 */
function filesAndSizeFromTorrentStream(torrent, timeout = 30000) {
  if (!torrent.infoHash && !torrent.magnetLink) {
    return Promise.reject(new Error("no infoHash or magnetLink"));
  }
  const magnet = torrent.magnetLink || decode.encode({ infoHash: torrent.infoHash, announce: torrent.trackers });
  return new Promise((resolve, rejected) => {
    const engine = new torrentStream(magnet, { connections: MAX_PEER_CONNECTIONS });
    // Fail if no peers deliver the metadata within the timeout window.
    const timeoutId = setTimeout(() => {
      engine.destroy();
      rejected(new Error('No available connections for torrent!'));
    }, timeout);
    engine.ready(() => {
      clearTimeout(timeoutId);
      const files = engine.files
          .map((file, fileId) => ({
            fileIndex: fileId,
            name: file.name,
            // Strip the leading root-directory segment from the path.
            path: file.path.replace(/^[^\/]+\//, ''),
            size: file.length
          }));
      const size = engine.torrent.length;
      engine.destroy();
      resolve({ files, size });
    });
  });
}
/**
 * Keeps only the meaningful video files of a torrent, dropping samples,
 * extras, anime openings/endings, redundant low-quality copies and watermark
 * stubs, all judged by size ratio against the largest video.
 */
function filterVideos(files) {
  // A single entry without a numeric index represents the torrent itself.
  if (files.length === 1 && !Number.isInteger(files[0].fileIndex)) {
    return files;
  }
  const videos = files.filter(entry => isVideo(entry.path));
  const largestSize = Math.max(...videos.map(entry => entry.size));
  const sampleRatio = videos.length <= 3 ? 3 : 10;
  const animeExtraRatio = 5;
  const redundantRatio = videos.length <= 3 ? 30 : Number.MAX_VALUE;
  const sizeRatio = entry => largestSize / parseInt(entry.size);
  const looksLikeSample = entry => entry.path.match(/sample|bonus|promo/i) && sizeRatio(entry) > sampleRatio;
  const looksRedundant = entry => sizeRatio(entry) > redundantRatio;
  const looksLikeExtra = entry => entry.path.match(/extras?\//i);
  const looksLikeAnimeExtra = entry => entry.path.match(/(?:\b|_)(?:NC)?(?:ED|OP|PV)(?:v?\d\d?)?(?:\b|_)/i)
      && sizeRatio(entry) > animeExtraRatio;
  const looksLikeWatermark = entry => entry.path.match(/^[A-Z-]+(?:\.[A-Z]+)?\.\w{3,4}$/)
      && sizeRatio(entry) > animeExtraRatio;
  return videos.filter(entry => !looksLikeSample(entry)
      && !looksLikeExtra(entry)
      && !looksLikeAnimeExtra(entry)
      && !looksRedundant(entry)
      && !looksLikeWatermark(entry));
}
// Keeps only the entries whose path carries a subtitle extension.
function filterSubtitles(files) {
  return files.filter(entry => isSubtitle(entry.path));
}
/**
 * Collects the deduplicated union of trackers for a torrent: from its magnet
 * link, from its scraped comma-separated trackers field, and from the shared
 * default lists. First-seen order is preserved.
 */
async function getTorrentTrackers(torrent) {
  const magnetTrackers = (torrent.magnetLink && decode(torrent.magnetLink).tr) || [];
  const scrapedTrackers = (torrent.trackers && torrent.trackers.split(',')) || [];
  const defaultTrackers = await getDefaultTrackers(torrent);
  const combined = [].concat(magnetTrackers, scrapedTrackers, defaultTrackers);
  return [...new Set(combined)];
}
// Downloads the community trackers list (memoized through cacheTrackers),
// retrying up to 3 times with a 5s delay before falling back to an empty
// list, then appends the hardcoded extras (plus anime trackers for anime).
async function getDefaultTrackers(torrent, retry = 3) {
  return cacheTrackers(() => needle('get', TRACKERS_URL, { open_timeout: SEEDS_CHECK_TIMEOUT })
      .then(response => response.body && response.body.trim())
      .then(body => body && body.split('\n\n') || []))
      .catch(() => retry > 0 ? delay(5000).then(() => getDefaultTrackers(torrent, retry - 1)) : [])
      .then(trackers => trackers.concat(ADDITIONAL_TRACKERS))
      .then(trackers => torrent.type === Type.ANIME ? trackers.concat(ANIME_TRACKERS) : trackers);
}
module.exports = { updateCurrentSeeders, updateTorrentSize, sizeAndFiles, torrentFiles }

View File

@@ -0,0 +1,182 @@
const { parse } = require('parse-torrent-title');
const { Type } = require('./types');
const Promises = require('./promises');
const repository = require('./repository');
const { getImdbId, getKitsuId } = require('./metadata');
const { parseTorrentFiles } = require('./torrentFiles');
const { assignSubtitles } = require('./torrentSubtitles');
const { isPackTorrent } = require('./parseHelper')
// Main ingestion path for a newly scraped torrent: normalizes its type and
// imdb/kitsu ids, resolves and parses the contained files, and persists the
// torrent plus one file row per video. Returns undefined (after logging)
// when no id or no videos can be determined.
async function createTorrentEntry(torrent, overwrite = false) {
  const titleInfo = parse(torrent.title);
  if ((titleInfo.seasons || torrent.title.match(/\[\d+-\d+/)) && torrent.type === Type.MOVIE) {
    // sometimes series torrent might be put into movies category
    torrent.type = Type.SERIES;
  }
  if (!torrent.imdbId && torrent.type !== Type.ANIME) {
    torrent.imdbId = await getImdbId(titleInfo, torrent.type)
        .catch(() => undefined);
  }
  if (torrent.imdbId && torrent.imdbId.length < 9) {
    // pad zeros to imdbId if missing
    torrent.imdbId = 'tt' + torrent.imdbId.replace('tt', '').padStart(7, '0');
  }
  if (torrent.imdbId && torrent.imdbId.length > 9 && torrent.imdbId.startsWith('tt0')) {
    // sanitize imdbId from redundant zeros
    torrent.imdbId = torrent.imdbId.replace(/tt0+([0-9]{7,})$/, 'tt$1');
  }
  if (!torrent.kitsuId && torrent.type === Type.ANIME) {
    torrent.kitsuId = await getKitsuId(titleInfo)
        .catch(() => undefined);
  }
  if (!torrent.imdbId && !torrent.kitsuId && !isPackTorrent(torrent)) {
    console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
    return;
  }
  // Parse files, optionally reusing existing file ids, then attach subtitles.
  const { contents, videos, subtitles } = await parseTorrentFiles(torrent)
      .then(torrentContents => overwrite ? overwriteExistingFiles(torrent, torrentContents) : torrentContents)
      .then(torrentContents => assignSubtitles(torrentContents))
      .catch(error => {
        console.log(`Failed getting files for ${torrent.title}`, error.message);
        return {};
      });
  if (!videos || !videos.length) {
    console.log(`no video files found for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`);
    return;
  }
  // File rows are created sequentially to respect the unique file constraint.
  return repository.createTorrent({ ...torrent, contents, subtitles })
      .then(() => Promises.sequence(videos.map(video => () => repository.createFile(video))))
      .then(() => console.log(`Created ${torrent.provider} entry for [${torrent.infoHash}] ${torrent.title}`));
}
// Matches freshly parsed videos against file rows already stored for the
// torrent (grouped by fileIndex) and carries their DB ids over, so saving
// them updates instead of inserting duplicates. Rejects when the parsed
// contents hold no videos.
async function overwriteExistingFiles(torrent, torrentContents) {
  const videos = torrentContents && torrentContents.videos;
  if (videos && videos.length) {
    const existingFiles = await repository.getFiles({ infoHash: videos[0].infoHash })
        .then((existing) => existing
            .reduce((map, next) => {
              // Files without a fileIndex are grouped under the null key.
              const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
              map[fileIndex] = (map[fileIndex] || []).concat(next);
              return map;
            }, {}))
        .catch(() => undefined);
    if (existingFiles && Object.keys(existingFiles).length) {
      const overwrittenVideos = videos
          .map(file => {
            // One-to-one torrents are matched regardless of fileIndex value.
            const mapping = videos.length === 1 && Object.keys(existingFiles).length === 1
                ? Object.values(existingFiles)[0]
                : existingFiles[file.fileIndex !== undefined ? file.fileIndex : null];
            if (mapping) {
              // shift() consumes the stored row so it is reused only once.
              const originalFile = mapping.shift();
              return { id: originalFile.id, ...file };
            }
            return file;
          });
      return { ...torrentContents, videos: overwrittenVideos };
    }
    return torrentContents;
  }
  return Promise.reject(`No video files found for: ${torrent.title}`);
}
// Adds the torrent to the skip-list so future scrapes ignore it.
async function createSkipTorrentEntry(torrent) {
  return repository.createSkipTorrent(torrent);
}
// Looks the torrent up in the skip-list first, then in the main table.
// NOTE(review): repository.getTorrent resolves null (it does not reject) for
// missing rows, so callers must treat null and undefined alike.
async function getStoredTorrentEntry(torrent) {
  return repository.getSkipTorrent(torrent)
      .catch(() => repository.getTorrent(torrent))
      .catch(() => undefined);
}
// Syncs an incoming scraped torrent with its stored counterpart: migrates
// provider info, fills missing languages, (re)creates contents and refreshes
// seeders. Resolves false when the torrent is not stored at all.
async function checkAndUpdateTorrent(torrent) {
  const storedTorrent = torrent.dataValues
      ? torrent
      : await repository.getTorrent(torrent).catch(() => undefined);
  if (!storedTorrent) {
    return false;
  }
  // Re-attribute entries from the defunct KickassTorrents provider.
  // NOTE(review): this reassignment is only persisted if the languages branch
  // below also runs and calls save() — confirm whether that is intended.
  if (storedTorrent.provider === 'KickassTorrents' && torrent.provider) {
    storedTorrent.provider = torrent.provider;
    storedTorrent.torrentId = torrent.torrentId;
  }
  if (!storedTorrent.languages && torrent.languages && storedTorrent.provider !== 'RARBG') {
    storedTorrent.languages = torrent.languages;
    // NOTE(review): save() is not awaited — failures would go unobserved.
    storedTorrent.save();
    console.log(`Updated [${torrent.infoHash}] ${torrent.title} language to ${torrent.languages}`);
  }
  return createTorrentContents({ ...storedTorrent.get(), torrentLink: torrent.torrentLink })
      .then(() => updateTorrentSeeders(torrent));
}
// Resolves and stores the file contents of an already-persisted torrent that
// was not yet "opened", reconciling newly parsed videos with the stored file
// rows (reusing a single placeholder row when possible, deleting it when the
// torrent turns out to contain several new videos).
async function createTorrentContents(torrent) {
  if (torrent.opened) {
    return;
  }
  const storedVideos = await repository.getFiles(torrent).catch(() => []);
  if (!storedVideos || !storedVideos.length) {
    return;
  }
  // A single stored row without a fileIndex is a placeholder for the torrent.
  const notOpenedVideo = storedVideos.length === 1 && !Number.isInteger(storedVideos[0].fileIndex);
  const imdbId = Promises.mostCommonValue(storedVideos.map(stored => stored.imdbId));
  const kitsuId = Promises.mostCommonValue(storedVideos.map(stored => stored.kitsuId));
  const { contents, videos, subtitles } = await parseTorrentFiles({ ...torrent, imdbId, kitsuId })
      .then(torrentContents => notOpenedVideo ? torrentContents : { ...torrentContents, videos: storedVideos })
      .then(torrentContents => assignSubtitles(torrentContents))
      .catch(error => {
        console.log(`Failed getting contents for [${torrent.infoHash}] ${torrent.title}`, error.message);
        return {};
      });
  if (!contents || !contents.length) {
    return;
  }
  if (notOpenedVideo && videos.length === 1) {
    // if both have a single video and stored one was not opened, update stored one to true metadata and use that
    storedVideos[0].fileIndex = videos[0].fileIndex;
    storedVideos[0].title = videos[0].title;
    storedVideos[0].size = videos[0].size;
    storedVideos[0].subtitles = videos[0].subtitles;
    videos[0] = storedVideos[0];
  }
  // no videos available or more than one new videos were in the torrent
  const shouldDeleteOld = notOpenedVideo && videos.every(video => !video.id);
  return repository.createTorrent({ ...torrent, contents, subtitles })
      .then(() => {
        if (shouldDeleteOld) {
          console.error(`Deleting old video for [${torrent.infoHash}] ${torrent.title}`)
          return storedVideos[0].destroy();
        }
        return Promise.resolve();
      })
      .then(() => Promises.sequence(videos.map(video => () => repository.createFile(video))))
      .then(() => console.log(`Created contents for ${torrent.provider} [${torrent.infoHash}] ${torrent.title}`))
      .catch(error => console.error(`Failed saving contents for [${torrent.infoHash}] ${torrent.title}`, error));
}
/**
 * Persists the torrent's seeder count when the torrent is identifiable and
 * the count is a valid integer; otherwise returns the torrent unchanged.
 * DB failures are logged and resolve to undefined (best-effort).
 */
async function updateTorrentSeeders(torrent) {
  const identifiable = torrent.infoHash || (torrent.provider && torrent.torrentId);
  if (!identifiable || !Number.isInteger(torrent.seeders)) {
    return torrent;
  }
  try {
    return await repository.setTorrentSeeders(torrent, torrent.seeders);
  } catch (error) {
    console.warn('Failed updating seeders:', error);
    return undefined;
  }
}
// Public API for torrent entry ingestion and maintenance.
module.exports = {
  createTorrentEntry,
  createTorrentContents,
  createSkipTorrentEntry,
  getStoredTorrentEntry,
  updateTorrentSeeders,
  checkAndUpdateTorrent
};

506
scraper/lib/torrentFiles.js Normal file
View File

@@ -0,0 +1,506 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const distance = require('jaro-winkler');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
const { torrentFiles } = require('../lib/torrent');
const { getMetadata, getImdbId, getKitsuId } = require('../lib/metadata');
const { parseSeriesVideos, isPackTorrent } = require('../lib/parseHelper');
const { Type } = require('./types');
const { isDisk } = require('./extension');
// Video files below this size are ignored (likely samples or junk).
const MIN_SIZE = 5 * 1024 * 1024; // 5 MB
// Serializes IMDb lookups to at most one per second.
// NOTE(review): the limiter is not referenced in this chunk — presumably used
// by the id-lookup helpers further down; confirm before removing.
const imdb_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 1000 });
// Entry point: resolves a torrent's files into catalog-ready video entries.
// Corrects the torrent type from metadata when they disagree, then routes to
// the movie or series parsing flow.
async function parseTorrentFiles(torrent) {
  const parsedTorrentName = parse(torrent.title);
  const metadata = await getMetadata(torrent.kitsuId || torrent.imdbId, torrent.type || Type.MOVIE)
      .then(meta => Object.assign({}, meta))
      .catch(() => undefined);
  // if (metadata && metadata.type !== torrent.type && torrent.type !== Type.ANIME) {
  //   throw new Error(`Mismatching entry type for ${torrent.name}: ${torrent.type}!=${metadata.type}`);
  // }
  if (torrent.type !== Type.ANIME && metadata && metadata.type && metadata.type !== torrent.type) {
    // it's actually a movie/series
    torrent.type = metadata.type;
  }
  // NOTE(review): the `season === 5 && episode in [1, 5]` clause looks like a
  // workaround for specific mis-parsed movie titles — confirm its intent.
  if (torrent.type === Type.MOVIE && (!parsedTorrentName.seasons ||
      parsedTorrentName.season === 5 && [1, 5].includes(parsedTorrentName.episode))) {
    return parseMovieFiles(torrent, parsedTorrentName, metadata);
  }
  return parseSeriesFiles(torrent, parsedTorrentName, metadata)
}
// Maps a movie torrent's video files to DB row objects. Single-movie torrents
// inherit the torrent's ids; multi-movie packs resolve an imdbId per file
// from its name (sequentially, to respect lookup rate limits).
async function parseMovieFiles(torrent, parsedName, metadata) {
  const { contents, videos, subtitles } = await getMoviesTorrentContent(torrent);
  const filteredVideos = videos
      .filter(video => video.size > MIN_SIZE)
      .filter(video => !isFeaturette(video));
  if (isSingleMovie(filteredVideos)) {
    const parsedVideos = filteredVideos.map(video => ({
      infoHash: torrent.infoHash,
      fileIndex: video.fileIndex,
      title: video.path || torrent.title,
      size: video.size || torrent.size,
      imdbId: torrent.imdbId || metadata && metadata.imdbId,
      kitsuId: torrent.kitsuId || metadata && metadata.kitsuId
    }));
    return { contents, videos: parsedVideos, subtitles };
  }
  const parsedVideos = await Promises.sequence(filteredVideos.map(video => () => isFeaturette(video)
      ? Promise.resolve(video)
      : findMovieImdbId(video.name).then(imdbId => ({ ...video, imdbId }))))
      .then(videos => videos.map(video => ({
        infoHash: torrent.infoHash,
        fileIndex: video.fileIndex,
        title: video.path || video.name,
        size: video.size,
        imdbId: video.imdbId,
      })));
  return { contents, videos: parsedVideos, subtitles };
}
// Maps a series torrent's video files to DB row objects: parses season and
// episode info out of each file, decomposes ambiguous numbering against the
// metadata, and expands multi-episode files into one row per episode.
async function parseSeriesFiles(torrent, parsedName, metadata) {
  const { contents, videos, subtitles } = await getSeriesTorrentContent(torrent);
  const parsedVideos = await Promise.resolve(videos)
      .then(videos => videos.filter(video => videos.length === 1 || video.size > MIN_SIZE))
      .then(videos => parseSeriesVideos(torrent, videos))
      .then(videos => decomposeEpisodes(torrent, videos, metadata))
      .then(videos => assignKitsuOrImdbEpisodes(torrent, videos, metadata))
      .then(videos => Promise.all(videos.map(video => video.isMovie
          ? mapSeriesMovie(video, torrent)
          : mapSeriesEpisode(video, torrent, videos))))
      .then(videos => videos
          .reduce((a, b) => a.concat(b), [])
          .map(video => isFeaturette(video) ? clearInfoFields(video) : video))
  return { contents, videos: parsedVideos, subtitles };
}
// Resolves a movie torrent's files. Non-pack torrents that fail to open fall
// back to a single pseudo-video for the whole torrent; disk-image torrents
// with no playable files are likewise collapsed into one pseudo-video.
async function getMoviesTorrentContent(torrent) {
  const files = await torrentFiles(torrent)
      .catch(error => {
        if (!isPackTorrent(torrent)) {
          return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] }
        }
        return Promise.reject(error);
      });
  if (files.contents && files.contents.length && !files.videos.length && isDiskTorrent(files.contents)) {
    files.videos = [{ name: torrent.title, path: torrent.title, size: torrent.size }];
  }
  return files;
}
/**
 * Resolves a series torrent's files. When opening fails and the torrent is
 * not a multi-episode pack, falls back to a single pseudo-video entry for the
 * torrent itself; pack torrents re-raise the failure.
 */
async function getSeriesTorrentContent(torrent) {
  try {
    return await torrentFiles(torrent);
  } catch (error) {
    if (isPackTorrent(torrent)) {
      throw error;
    }
    return { videos: [{ name: torrent.title, path: torrent.title, size: torrent.size }] };
  }
}
// Expands one parsed series file into DB row objects, one per contained
// episode. Files without episode info become a generic row only when sibling
// files carry episodes (or the torrent title names a season); otherwise they
// are dropped by resolving to an empty array.
async function mapSeriesEpisode(file, torrent, files) {
  if (!file.episodes && !file.kitsuEpisodes) {
    if (files.some(otherFile => otherFile.episodes || otherFile.kitsuEpisodes) || parse(torrent.title).seasons) {
      return Promise.resolve({
        infoHash: torrent.infoHash,
        fileIndex: file.fileIndex,
        title: file.path || file.name,
        size: file.size,
        imdbId: torrent.imdbId || file.imdbId,
      });
    }
    return Promise.resolve([]);
  }
  // Iterate by index so imdb and kitsu episode lists stay aligned.
  const episodeIndexes = [...(file.episodes || file.kitsuEpisodes).keys()];
  return Promise.resolve(episodeIndexes.map((index) => ({
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: file.imdbId || torrent.imdbId,
    imdbSeason: file.season,
    imdbEpisode: file.episodes && file.episodes[index],
    kitsuId: file.kitsuId || torrent.kitsuId,
    kitsuEpisode: file.kitsuEpisodes && file.kitsuEpisodes[index]
  })))
}
// Maps a movie file found inside a series torrent to a DB row: resolves its
// own kitsu/imdb id from the file name, then (when the movie's metadata lists
// videos) pins the matching season/episode entry.
async function mapSeriesMovie(file, torrent) {
  const kitsuId = torrent.type === Type.ANIME ? await findMovieKitsuId(file) : undefined;
  const imdbId = !kitsuId ? await findMovieImdbId(file) : undefined;
  const metadata = await getMetadata(kitsuId || imdbId, Type.MOVIE).catch(() => ({}));
  const hasEpisode = metadata.videos && metadata.videos.length && (file.episode || metadata.videos.length === 1);
  const episodeVideo = hasEpisode && metadata.videos[(file.episode || 1) - 1];
  return [{
    infoHash: torrent.infoHash,
    fileIndex: file.fileIndex,
    title: file.path || file.name,
    size: file.size,
    imdbId: metadata.imdbId || imdbId,
    kitsuId: metadata.kitsuId || kitsuId,
    imdbSeason: episodeVideo && metadata.imdbId ? episodeVideo.imdbSeason : undefined,
    imdbEpisode: episodeVideo && metadata.imdbId ? episodeVideo.imdbEpisode || episodeVideo.episode : undefined,
    kitsuEpisode: episodeVideo && metadata.kitsuId ? episodeVideo.kitsuEpisode || episodeVideo.episode : undefined
  }];
}
// Normalizes raw parsed episode numbers into proper season/episode pairs,
// picking the decomposition strategy (concatenated SEE numbers, air-date
// naming, or absolute numbering) that best matches the metadata.
async function decomposeEpisodes(torrent, files, metadata = { episodeCount: [] }) {
  if (files.every(file => !file.episodes && !file.date)) {
    return files;
  }
  preprocessEpisodes(files);
  if (torrent.type === Type.ANIME && torrent.kitsuId) {
    if (needsCinemetaMetadataForAnime(files, metadata)) {
      // In some cases anime could be resolved to wrong kitsuId
      // because of imdb season naming/absolute per series naming/multiple seasons
      // So in these cases we need to fetch cinemeta based metadata and decompose episodes using that
      await updateToCinemetaMetadata(metadata);
      if (files.some(file => Number.isInteger(file.season))) {
        // sometimes multi season anime torrents don't include season 1 naming
        files
            .filter(file => !Number.isInteger(file.season) && file.episodes)
            .forEach(file => file.season = 1);
      }
    } else {
      // otherwise for anime type episodes are always absolute and for a single season
      files
          .filter(file => file.episodes && file.season !== 0)
          .forEach(file => file.season = 1);
      return files;
    }
  }
  // All episode numbers across non-movie files, ascending, used by the
  // concat-season heuristic below.
  const sortedEpisodes = files
      .map(file => !file.isMovie && file.episodes || [])
      .reduce((a, b) => a.concat(b), [])
      .sort((a, b) => a - b);
  if (isConcatSeasonAndEpisodeFiles(files, sortedEpisodes, metadata)) {
    decomposeConcatSeasonAndEpisodeFiles(torrent, files, metadata);
  } else if (isDateEpisodeFiles(files, metadata)) {
    decomposeDateEpisodeFiles(torrent, files, metadata);
  } else if (isAbsoluteEpisodeFiles(files, metadata)) {
    decomposeAbsoluteEpisodeFiles(torrent, files, metadata);
  }
  // decomposeEpisodeTitleFiles(torrent, files, metadata);
  return files;
}
/**
 * Specials are sometimes tagged as episode 0 of a season (e.g. S02E00).
 * Flip those in place so the season number becomes the special's episode
 * inside season 0.
 */
function preprocessEpisodes(files) {
  // reverse special episode naming when they named with 0 episode, ie. S02E00
  files
      .filter(file => Number.isInteger(file.season) && file.episode === 0)
      .forEach(file => {
        const specialEpisode = file.season;
        file.episode = specialEpisode;
        file.episodes = [specialEpisode];
        file.season = 0;
      });
}
// Heuristic: do the files use concatenated season+episode numbering
// (ex. 101 = S01E01)? True when most episode numbers fit that scheme within
// the known per-season counts, or when enough exceed the show's total count.
function isConcatSeasonAndEpisodeFiles(files, sortedEpisodes, metadata) {
  if (metadata.kitsuId !== undefined) {
    // anime does not use this naming scheme in 99% of cases;
    return false;
  }
  // decompose concat season and episode files (ex. 101=S01E01) in case:
  // 1. file has a season, but individual files are concatenated with that season (ex. path Season 5/511 - Prize
  // Fighters.avi)
  // 2. file does not have a season and the episode does not go out of range for the concat season
  // episode count
  const thresholdAbove = Math.max(Math.ceil(files.length * 0.05), 5);
  const thresholdSorted = Math.max(Math.ceil(files.length * 0.8), 8);
  const threshold = Math.max(Math.ceil(files.length * 0.8), 5);
  // Episodes that decode to a valid (season, episode) pair under concat rules.
  const sortedConcatEpisodes = sortedEpisodes
      .filter(ep => ep > 100)
      .filter(ep => metadata.episodeCount[div100(ep) - 1] < ep)
      .filter(ep => metadata.episodeCount[div100(ep) - 1] >= mod100(ep));
  const concatFileEpisodes = files
      .filter(file => !file.isMovie && file.episodes)
      .filter(file => !file.season || file.episodes.every(ep => div100(ep) === file.season));
  const concatAboveTotalEpisodeCount = files
      .filter(file => !file.isMovie && file.episodes && file.episodes.every(ep => ep > 100))
      .filter(file => file.episodes.every(ep => ep > metadata.totalCount));
  return sortedConcatEpisodes.length >= thresholdSorted && concatFileEpisodes.length >= threshold
      || concatAboveTotalEpisodeCount.length >= thresholdAbove;
}
/**
 * Heuristic: are the files named by air date? True when every file carries a
 * date and has no usable season (none at all, or one outside the known counts).
 */
function isDateEpisodeFiles(files, metadata) {
  return files.every(file => {
    const noUsableSeason = !file.season || !metadata.episodeCount[file.season - 1];
    return noUsableSeason && file.date;
  });
}
/**
 * Heuristic: do the files use absolute episode numbering? True when all
 * episode files have seasons beyond the known season count (or none at all),
 * or when enough files have episodes exceeding their season's episode count.
 */
function isAbsoluteEpisodeFiles(files, metadata) {
  const threshold = Math.ceil(files.length / 5);
  const episodeFiles = files.filter(file => !file.isMovie && file.episodes);
  const outOfRangeFiles = files
      .filter(file => file.season && file.episodes)
      .filter(file => file.episodes.every(ep => metadata.episodeCount[file.season - 1] < ep));
  const allSeasonsUnknown = episodeFiles
      .every(file => !file.season || file.season > metadata.episodeCount.length);
  return allSeasonsUnknown || outOfRangeFiles.length >= threshold;
  // && !isNewEpisodesNotInMetadata(files, metadata);
}
/**
 * Detects a single fresh episode of an ongoing show that cinemeta has not
 * indexed yet: its season/episode numbers run past the last known ones.
 */
function isNewEpisodesNotInMetadata(files, metadata) {
  // new episode might not yet been indexed by cinemeta.
  // detect this if episode number is larger than the last episode or season is larger than the last one
  if (files.length !== 1 || !/continuing|current/i.test(metadata.status)) {
    return false;
  }
  return files
      .filter(file => !file.isMovie && file.episodes)
      .every(file => file.season >= metadata.episodeCount.length
          && file.episodes.every(ep => ep > metadata.episodeCount[file.season - 1]));
}
// Splits concatenated season+episode numbers (ex. 511 -> S05E11) in place,
// but only for files whose numbers are consistent with either their parsed
// season or with no season at all, and only while the affected seasons have
// fewer than 100 episodes (otherwise the scheme is ambiguous).
function decomposeConcatSeasonAndEpisodeFiles(torrent, files, metadata) {
  files
      .filter(file => file.episodes && file.season !== 0 && file.episodes.every(ep => ep > 100))
      .filter(file => metadata.episodeCount[(file.season || div100(file.episodes[0])) - 1] < 100)
      .filter(file => file.season && file.episodes.every(ep => div100(ep) === file.season) || !file.season)
      .forEach(file => {
        file.season = div100(file.episodes[0]);
        file.episodes = file.episodes.map(ep => mod100(ep))
      });
}
// Converts absolute episode numbers into season/episode pairs in place,
// using the cumulative per-season episode counts from metadata.
function decomposeAbsoluteEpisodeFiles(torrent, files, metadata) {
  if (metadata.episodeCount.length === 0) {
    // No counts known — just default plain episode files to season 1.
    files
        .filter(file => !Number.isInteger(file.season) && file.episodes && !file.isMovie)
        .forEach(file => {
          file.season = 1;
        });
    return;
  }
  files
      .filter(file => file.episodes && !file.isMovie && file.season !== 0)
      .filter(file => !file.season || (metadata.episodeCount[file.season - 1] || 0) < file.episodes[0])
      .forEach(file => {
        // First season whose cumulative episode count reaches the absolute
        // number. When none does, find() yields undefined, the +1 turns the
        // expression NaN, and the || falls back to the last known season.
        const seasonIdx = ([...metadata.episodeCount.keys()]
            .find((i) => metadata.episodeCount.slice(0, i + 1).reduce((a, b) => a + b) >= file.episodes[0])
            + 1 || metadata.episodeCount.length) - 1;
        file.season = seasonIdx + 1;
        // Rebase episode numbers against the episodes of all prior seasons.
        file.episodes = file.episodes
            .map(ep => ep - metadata.episodeCount.slice(0, seasonIdx).reduce((a, b) => a + b, 0))
      });
}
function decomposeDateEpisodeFiles(torrent, files, metadata) {
if (!metadata || !metadata.videos || !metadata.videos.length) {
return;
}
const timeZoneOffset = getTimeZoneOffset(metadata.country);
const offsetVideos = metadata.videos
.reduce((map, video) => {
const releaseDate = moment(video.released).utcOffset(timeZoneOffset).format('YYYY-MM-DD');
map[releaseDate] = video;
return map;
}, {});
files
.filter(file => file.date)
.forEach(file => {
const video = offsetVideos[file.date];
if (video) {
file.season = video.season;
file.episodes = [video.episode];
}
});
}
function decomposeEpisodeTitleFiles(torrent, files, metadata) {
files
// .filter(file => !file.season)
.map(file => {
const episodeTitle = file.name.replace('_', ' ')
.replace(/^.*(?:E\d+[abc]?|- )\s?(.+)\.\w{1,4}$/, '$1')
.trim();
const foundEpisode = metadata.videos
.map(video => ({ ...video, distance: distance(episodeTitle, video.name) }))
.sort((a, b) => b.distance - a.distance)[0];
if (foundEpisode) {
file.isMovie = false;
file.season = foundEpisode.season;
file.episodes = [foundEpisode.episode];
}
})
}
function getTimeZoneOffset(country) {
switch (country) {
case 'United States':
case 'USA':
return '-08:00';
default:
return '00:00';
}
}
function assignKitsuOrImdbEpisodes(torrent, files, metadata) {
if (!metadata || !metadata.videos || !metadata.videos.length) {
if (torrent.type === Type.ANIME) {
// assign episodes as kitsu episodes for anime when no metadata available for imdb mapping
files
.filter(file => file.season && file.episodes)
.forEach(file => {
file.kitsuEpisodes = file.episodes;
file.season = undefined;
file.episodes = undefined;
})
if (metadata.type === Type.MOVIE && files.every(file => !file.imdbId)) {
// sometimes a movie has episode naming, thus not recognized as a movie and imdbId not assigned
files.forEach(file => file.imdbId = metadata.imdbId);
}
}
return files;
}
const seriesMapping = metadata.videos
.reduce((map, video) => {
const episodeMap = map[video.season] || {};
episodeMap[video.episode] = video;
map[video.season] = episodeMap;
return map;
}, {});
if (metadata.videos.some(video => Number.isInteger(video.imdbSeason)) || !metadata.imdbId) {
// kitsu episode info is the base
files
.filter(file => Number.isInteger(file.season) && file.episodes)
.map(file => {
const seasonMapping = seriesMapping[file.season];
const episodeMapping = seasonMapping && seasonMapping[file.episodes[0]];
file.kitsuEpisodes = file.episodes;
if (episodeMapping && Number.isInteger(episodeMapping.imdbSeason)) {
file.imdbId = metadata.imdbId;
file.season = episodeMapping.imdbSeason;
file.episodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].imdbEpisode);
} else {
// no imdb mapping available for episode
file.season = undefined;
file.episodes = undefined;
}
});
} else if (metadata.videos.some(video => video.kitsuEpisode)) {
// imdb episode info is base
files
.filter(file => Number.isInteger(file.season) && file.episodes)
.forEach(file => {
if (seriesMapping[file.season]) {
const seasonMapping = seriesMapping[file.season];
file.imdbId = metadata.imdbId;
file.kitsuId = seasonMapping[file.episodes[0]] && seasonMapping[file.episodes[0]].kitsuId;
file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
} else if (seriesMapping[file.season - 1]) {
// sometimes a second season might be a continuation of the previous season
const seasonMapping = seriesMapping[file.season - 1];
const episodes = Object.values(seasonMapping);
const firstKitsuId = episodes.length && episodes[0].kitsuId;
const differentTitlesCount = new Set(episodes.map(ep => ep.kitsuId)).size
const skippedCount = episodes.filter(ep => ep.kitsuId === firstKitsuId).length;
const seasonEpisodes = files
.filter(otherFile => otherFile.season === file.season)
.reduce((a, b) => a.concat(b.episodes), []);
const isAbsoluteOrder = seasonEpisodes.every(ep => ep > skippedCount && ep <= episodes.length)
const isNormalOrder = seasonEpisodes.every(ep => ep + skippedCount <= episodes.length)
if (differentTitlesCount >= 1 && (isAbsoluteOrder || isNormalOrder)) {
file.imdbId = metadata.imdbId;
file.season = file.season - 1;
file.episodes = file.episodes.map(ep => isAbsoluteOrder ? ep : ep + skippedCount);
file.kitsuId = seasonMapping[file.episodes[0]].kitsuId;
file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
}
} else if (Object.values(seriesMapping).length === 1 && seriesMapping[1]) {
// sometimes series might be named with sequel season but it's not a season on imdb and a new title
const seasonMapping = seriesMapping[1];
file.imdbId = metadata.imdbId;
file.season = 1;
file.kitsuId = seasonMapping[file.episodes[0]].kitsuId;
file.kitsuEpisodes = file.episodes.map(ep => seasonMapping[ep] && seasonMapping[ep].kitsuEpisode);
}
});
}
return files;
}
function needsCinemetaMetadataForAnime(files, metadata) {
if (!metadata || !metadata.imdbId || !metadata.videos || !metadata.videos.length) {
return false;
}
const minSeason = Math.min(...metadata.videos.map(video => video.imdbSeason)) || Number.MAX_VALUE;
const maxSeason = Math.max(...metadata.videos.map(video => video.imdbSeason)) || Number.MAX_VALUE;
const differentSeasons = new Set(metadata.videos
.map(video => video.imdbSeason)
.filter(season => Number.isInteger(season))).size;
const total = metadata.totalCount || Number.MAX_VALUE;
return differentSeasons > 1 || files
.filter(file => !file.isMovie && file.episodes)
.some(file => file.season < minSeason || file.season > maxSeason || file.episodes.every(ep => ep > total));
}
async function updateToCinemetaMetadata(metadata) {
return getMetadata(metadata.imdbId, metadata.type)
.then(newMetadata => !newMetadata.videos || !newMetadata.videos.length ? metadata : newMetadata)
.then(newMetadata => {
metadata.videos = newMetadata.videos;
metadata.episodeCount = newMetadata.episodeCount;
metadata.totalCount = newMetadata.totalCount;
return metadata;
})
.catch(error => console.warn(`Failed ${metadata.imdbId} metadata cinemeta update due: ${error.message}`));
}
function findMovieImdbId(title) {
const parsedTitle = typeof title === 'string' ? parse(title) : title;
return imdb_limiter.schedule(() => getImdbId(parsedTitle, Type.MOVIE).catch(() => undefined));
}
function findMovieKitsuId(title) {
const parsedTitle = typeof title === 'string' ? parse(title) : title;
return getKitsuId(parsedTitle, Type.MOVIE).catch(() => undefined);
}
function isDiskTorrent(contents) {
return contents.some(content => isDisk(content.path));
}
function isSingleMovie(videos) {
return videos.length === 1 ||
(videos.length === 2 &&
videos.find(v => /\b(?:part|disc|cd)[ ._-]?0?1\b|^0?1\.\w{2,4}$/i.test(v.path)) &&
videos.find(v => /\b(?:part|disc|cd)[ ._-]?0?2\b|^0?2\.\w{2,4}$/i.test(v.path)));
}
function isFeaturette(video) {
return /featurettes?\/|extras-grym/i.test(video.path);
}
function clearInfoFields(video) {
video.imdbId = undefined;
video.imdbSeason = undefined;
video.imdbEpisode = undefined;
video.kitsuId = undefined;
video.kitsuEpisode = undefined;
return video;
}
function div100(episode) {
return (episode / 100 >> 0); // floor to nearest int
}
function mod100(episode) {
return episode % 100;
}
module.exports = { parseTorrentFiles };

View File

@@ -0,0 +1,91 @@
const { parse } = require('parse-torrent-title');
function assignSubtitles({ contents, videos, subtitles }) {
if (videos && videos.length && subtitles && subtitles.length) {
if (videos.length === 1) {
videos[0].subtitles = subtitles;
return { contents, videos, subtitles: [] };
}
const parsedVideos = videos
.map(video => _parseVideo(video));
const assignedSubs = subtitles
.map(subtitle => ({ subtitle, videos: _mostProbableSubtitleVideos(subtitle, parsedVideos) }));
const unassignedSubs = assignedSubs
.filter(assignedSub => !assignedSub.videos)
.map(assignedSub => assignedSub.subtitle);
assignedSubs
.filter(assignedSub => assignedSub.videos)
.forEach(assignedSub => assignedSub.videos
.forEach(video => video.subtitles = (video.subtitles || []).concat(assignedSub.subtitle)));
return { contents, videos, subtitles: unassignedSubs };
}
return { contents, videos, subtitles };
}
function _parseVideo(video) {
const fileName = video.title.split('/').pop().replace(/\.(\w{2,4})$/, '');
const folderName = video.title.replace(/\/?[^/]+$/, '');
return {
videoFile: video,
fileName: fileName,
folderName: folderName,
...parseFilename(video.title)
};
}
function _mostProbableSubtitleVideos(subtitle, parsedVideos) {
const subTitle = (subtitle.title || subtitle.path).split('/').pop().replace(/\.(\w{2,4})$/, '');
const parsedSub = parsePath(subtitle.title || subtitle.path);
const byFileName = parsedVideos.filter(video => subTitle.includes(video.fileName));
if (byFileName.length === 1) {
return byFileName.map(v => v.videoFile);
}
const byTitleSeasonEpisode = parsedVideos.filter(video => video.title === parsedSub.title
&& arrayEquals(video.seasons, parsedSub.seasons)
&& arrayEquals(video.episodes, parsedSub.episodes));
if (singleVideoFile(byTitleSeasonEpisode)) {
return byTitleSeasonEpisode.map(v => v.videoFile);
}
const bySeasonEpisode = parsedVideos.filter(video => arrayEquals(video.seasons, parsedSub.seasons)
&& arrayEquals(video.episodes, parsedSub.episodes));
if (singleVideoFile(bySeasonEpisode)) {
return bySeasonEpisode.map(v => v.videoFile);
}
const byTitle = parsedVideos.filter(video => video.title && video.title === parsedSub.title);
if (singleVideoFile(byTitle)) {
return byTitle.map(v => v.videoFile);
}
const byEpisode = parsedVideos.filter(video => arrayEquals(video.episodes, parsedSub.episodes));
if (singleVideoFile(byEpisode)) {
return byEpisode.map(v => v.videoFile);
}
return undefined;
}
function singleVideoFile(videos) {
return new Set(videos.map(v => v.videoFile.fileIndex)).size === 1;
}
function parsePath(path) {
const pathParts = path.split('/').map(part => parseFilename(part));
const parsedWithEpisode = pathParts.find(parsed => parsed.season && parsed.episodes);
return parsedWithEpisode || pathParts[pathParts.length - 1];
}
function parseFilename(filename) {
const parsedInfo = parse(filename)
const titleEpisode = parsedInfo.title.match(/(\d+)$/);
if (!parsedInfo.episodes && titleEpisode) {
parsedInfo.episodes = [parseInt(titleEpisode[1], 10)];
}
return parsedInfo;
}
function arrayEquals(array1, array2) {
if (!array1 || !array2) return array1 === array2;
return array1.length === array2.length && array1.every((value, index) => value === array2[index])
}
module.exports = { assignSubtitles }

5
scraper/lib/types.js Normal file
View File

@@ -0,0 +1,5 @@
exports.Type = {
MOVIE: 'movie',
SERIES: 'series',
ANIME: 'anime'
};

145
scraper/manual/manual.js Normal file
View File

@@ -0,0 +1,145 @@
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const Promises = require('../lib/promises');
const { mostCommonValue } = require('../lib/promises');
const repository = require('../lib/repository');
const { getImdbId, getKitsuId } = require('../lib/metadata');
const { parseTorrentFiles } = require('../lib/torrentFiles');
const { createTorrentContents } = require('../lib/torrentEntries');
const { assignSubtitles } = require('../lib/torrentSubtitles');
const { Type } = require('../lib/types');
const limiter = new Bottleneck({ maxConcurrent: 40 });
async function updateMovieCollections() {
const collectionFiles = await repository.getFilesBasedOnTitle('logy')
.then(files => files.filter(file => file.fileIndex === null))
.then(files => files.filter(file => parse(file.title).complete));
collectionFiles.map(original => repository.getTorrent({ infoHash: original.infoHash })
.then(torrent => parseTorrentFiles({ ...torrent.get(), imdbId: original.imdbId }))
.then(files => Promise.all(files.map(file => {
console.log(file);
return repository.createFile(file)
})))
.then(createdFiled => {
if (createdFiled && createdFiled.length) {
console.log(`Updated movie collection ${original.title}`);
repository.deleteFile(original)
} else {
console.log(`Failed updating movie collection ${original.title}`);
}
}));
}
async function reapplySeriesSeasonsSavedAsMovies() {
return repository.getTorrentsBasedOnTitle('(?:[^a-zA-Z0-9]|^)[Ss][012]?[0-9](?:[^0-9]|$)', Type.MOVIE)
.then(torrents => Promise.all(torrents
.filter(torrent => parse(torrent.title).seasons)
.map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, false)
.then(() => {
torrent.type = Type.SERIES;
return torrent.save();
})))))
.then(() => console.log('Finished updating multiple torrents'));
}
async function reapplyDecomposingToTorrentsOnRegex(regex) {
return repository.getTorrentsBasedOnTitle(regex, Type.ANIME)
.then(torrents => Promise.all(torrents
.map(torrent => limiter.schedule(() => reapplyEpisodeDecomposing(torrent.infoHash, true)))))
.then(() => console.log('Finished updating multiple torrents'));
}
async function reapplyEpisodeDecomposing(infoHash, includeSourceFiles = true) {
const torrent = await repository.getTorrent({ infoHash });
const storedFiles = await repository.getFiles({ infoHash });
const fileIndexMap = storedFiles
.reduce((map, next) => {
const fileIndex = next.fileIndex !== undefined ? next.fileIndex : null;
map[fileIndex] = (map[fileIndex] || []).concat(next);
return map;
}, {});
const files = includeSourceFiles && Object.values(fileIndexMap)
.map(sameIndexFiles => sameIndexFiles[0])
.map(file => ({
fileIndex: file.fileIndex,
name: file.title.replace(/.*\//, ''),
path: file.title,
size: file.size
}));
const kitsuId = undefined;
const imdbId = kitsuId
? undefined
: mostCommonValue(storedFiles.map(file => file.imdbId))
|| await getImdbId(parse(torrent.title)).catch(() => undefined);
if (!imdbId && !kitsuId) {
console.log(`imdbId or kitsuId not found: ${torrent.provider} ${torrent.title}`);
return Promise.resolve();
}
return parseTorrentFiles({ ...torrent.get(), imdbId, kitsuId, files })
.then(torrentContents => torrentContents.videos)
.then(newFiles => newFiles.map(file => {
const fileIndex = file.fileIndex !== undefined ? file.fileIndex : null;
const mapping = fileIndexMap[fileIndex];
if (mapping) {
const originalFile = mapping.shift();
if (originalFile) {
if (!originalFile.imdbId) {
originalFile.imdbId = file.imdbId
}
originalFile.imdbSeason = file.imdbSeason;
originalFile.imdbEpisode = file.imdbEpisode;
originalFile.kitsuId = file.kitsuId;
originalFile.kitsuEpisode = file.kitsuEpisode;
return originalFile;
}
}
return file;
}))
.then(updatedFiles => Promise.all(updatedFiles
.map(file => file.id ? file.save() : repository.createFile(file))))
.then(() => console.log(`Updated files for [${torrent.infoHash}] ${torrent.title}`));
}
async function assignSubs() {
const unassignedSubs = await repository.getUnassignedSubtitles()
.then(subs => subs.reduce((map, sub) => {
map[sub.infoHash] = (map[sub.infoHash] || []).concat(sub);
return map;
}, {}));
const infoHashes = Object.keys(unassignedSubs);
return Promise.all(infoHashes.map(async infoHash => {
const videos = await repository.getFiles({ infoHash });
const subtitles = unassignedSubs[infoHash];
const assignedContents = assignSubtitles({ videos, subtitles });
return Promise.all(assignedContents.videos
.filter(video => video.subtitles)
.map(video => repository.upsertSubtitles(video, video.subtitles)));
}));
}
async function openTorrentContents() {
const limiter = new Bottleneck({ maxConcurrent: 15 });
const unopenedTorrents = await repository.getNoContentsTorrents();
return Promise.all(unopenedTorrents.map(torrent => limiter.schedule(() => createTorrentContents(torrent))))
.then(() => unopenedTorrents.length === 500 ? openTorrentContents() : Promise.resolve)
}
// const infoHashes = [
// ]
// Promises.sequence(infoHashes.map(infoHash => () => reapplyEpisodeDecomposing(infoHash)))
// .then(() => console.log('Finished'));
//findAllFiles().then(() => console.log('Finished'));
//updateMovieCollections().then(() => console.log('Finished'));
reapplyEpisodeDecomposing('96cc18f564f058384c18b4966a183d81808ce3fb', true).then(() => console.log('Finished'));
//reapplySeriesSeasonsSavedAsMovies().then(() => console.log('Finished'));
//reapplyDecomposingToTorrentsOnRegex('.*Title.*').then(() => console.log('Finished'));
//reapplyManualHashes().then(() => console.log('Finished'));
// assignSubs().then(() => console.log('Finished'));
// openTorrentContents().then(() => console.log('Finished'));

4269
scraper/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

49
scraper/package.json Normal file
View File

@@ -0,0 +1,49 @@
{
"name": "stremio-torrentio",
"version": "1.0.14",
"exports": "./index.js",
"type": "commonjs",
"scripts": {
"start": "node index.js"
},
"author": "TheBeastLT <pauliox@beyond.lt>",
"license": "MIT",
"dependencies": {
"@putdotio/api-client": "^8.42.0",
"all-debrid-api": "^1.1.0",
"axios": "^1.6.1",
"bottleneck": "^2.19.5",
"cache-manager": "^3.4.4",
"cache-manager-mongodb": "^0.3.0",
"cors": "^2.8.5",
"debrid-link-api": "^1.0.1",
"express-rate-limit": "^6.7.0",
"google-it": "^1.6.4",
"google-sr": "^3.2.1",
"he": "^1.2.0",
"jaro-winkler": "^0.2.8",
"magnet-uri": "^6.2.0",
"name-to-imdb": "^3.0.4",
"named-queue": "^2.2.1",
"needle": "^3.3.1",
"node-schedule": "^2.1.1",
"nodejs-bing": "^0.1.0",
"nyaapi": "^2.4.4",
"offcloud-api": "^1.0.2",
"parse-torrent-title": "git://github.com/TheBeastLT/parse-torrent-title.git#022408972c2a040f846331a912a6a8487746a654",
"pg": "^8.11.3",
"pg-hstore": "^2.3.4",
"premiumize-api": "^1.0.3",
"prom-client": "^12.0.0",
"real-debrid-api": "git://github.com/TheBeastLT/node-real-debrid.git#d1f7eaa8593b947edbfbc8a92a176448b48ef445",
"request-ip": "^3.3.0",
"router": "^1.3.8",
"sequelize": "^6.31.1",
"stremio-addon-sdk": "^1.6.10",
"sugar-date": "^2.0.6",
"swagger-stats": "^0.99.7",
"torrent-stream": "^1.2.1",
"ua-parser-js": "^1.0.36",
"user-agents": "^1.0.1444"
}
}

View File

@@ -0,0 +1,14 @@
const { scheduleScraping, scrapeAll } = require('./scraper')
const { scheduleUpdateSeeders, scheduleUpdateSeedersForNewTorrents } = require('./seeders')
function startScraper() {
if (process.env.ENABLE_SCHEDULING) {
scheduleScraping();
scheduleUpdateSeeders();
scheduleUpdateSeedersForNewTorrents();
} else {
scrapeAll()
}
}
module.exports = { startScraper }

View File

@@ -0,0 +1,28 @@
const schedule = require('node-schedule');
const scrapers = require('./scrapers');
const { sequence } = require('../lib/promises')
function scheduleScraping() {
const allCrons = scrapers.reduce((crons, provider) => {
crons[provider.cron] = (crons[provider.cron] || []).concat(provider)
return crons;
}, {});
Object.entries(allCrons).forEach(([cron, providers]) => schedule.scheduleJob(cron, () => _scrapeProviders(providers)))
}
function scrapeAll() {
return _scrapeProviders(scrapers)
}
async function _scrapeProviders(providers) {
return sequence(providers.map(provider => () => _singleScrape(provider)));
}
async function _singleScrape(provider) {
return provider.scraper.scrape().catch(error => {
console.warn(`Failed ${provider.name} scraping due: `, error);
return Promise.resolve()
})
}
module.exports = { scheduleScraping, scrapeAll }

View File

@@ -0,0 +1,45 @@
const thepiratebayScraper = require('../scrapers/thepiratebay/thepiratebay_scraper');
const thepiratebayFakeRemoval = require('../scrapers/thepiratebay/thepiratebay_fakes_removal');
const ytsScraper = require('../scrapers/yts/yts_scraper');
const eztvScraper = require('../scrapers/eztv/eztv_scraper');
const leetxScraper = require('../scrapers/1337x/1337x_scraper');
const kickassScraper = require('../scrapers/kickass/kickass_scraper');
const rarbgScraper = require('../scrapers/rarbg/rarbg_scraper');
const nyaaPantsuScraper = require('../scrapers/nyaapantsu/nyaa_pantsu_scraper');
const nyaaSiScraper = require('../scrapers/nyaasi/nyaa_si_scraper');
const erairawsScraper = require('../scrapers/erairaws/erairaws_scraper');
const torrentGalaxyScraper = require('../scrapers/torrentgalaxy/torrentgalaxy_scraper');
const rutorScraper = require('../scrapers/rutor/rutor_scraper');
const Comando = require('../scrapers/comando/comando_scraper')
const ComoEuBaixo = require('../scrapers/comoeubaixo/comoeubaixo_scraper')
const Lapumia = require('../scrapers/lapumia/lapumia_scraper')
const OndeBaixa = require('../scrapers/ondebaixa/ondebaixa_scraper');
const AnimesTorrent = require('../scrapers/animestorrent/animestorrent_scraper')
const DarkMahou = require('../scrapers/darkmahou/darkmahou_scraper')
module.exports = [
{ scraper: ytsScraper, name: ytsScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: eztvScraper, name: eztvScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: nyaaSiScraper, name: nyaaSiScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: nyaaPantsuScraper, name: nyaaPantsuScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: rarbgScraper, name: rarbgScraper.NAME, cron: '0 0 */1 ? * *' },
{ scraper: rutorScraper, name: rutorScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: thepiratebayScraper, name: thepiratebayScraper.NAME, cron: '0 0 */2 ? * *' },
{ scraper: thepiratebayFakeRemoval, name: thepiratebayFakeRemoval.NAME, cron: '0 0 */12 ? * *' },
{ scraper: torrentGalaxyScraper, name: torrentGalaxyScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: leetxScraper, name: leetxScraper.NAME, cron: '0 0 */4 ? * *' },
// { scraper: kickassScraper, name: kickassScraper.NAME, cron: '0 0 */4 ? * *' },
{ scraper: Comando, name: Comando.NAME, cron: '0 0 */4 ? * *' },
{ scraper: ComoEuBaixo, name: ComoEuBaixo.NAME, cron: '0 0 */4 ? * *' },
{ scraper: Lapumia, name: Lapumia.NAME, cron: '0 0 */4 ? * *' },
{ scraper: OndeBaixa, name: OndeBaixa.NAME, cron: '0 0 */4 ? * *' },
// { scraper: AnimesTorrent, name: AnimesTorrent.NAME, cron: '0 0 */4 ? * *' },
// { scraper: DarkMahou, name: DarkMahou.NAME, cron: '0 0 */4 ? * *' },
{ scraper: erairawsScraper, name: erairawsScraper.NAME, cron: '0 0 */24 ? * *' },
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
// { scraper: require('../scrapers/1337x/1337x_search_scraper') }
// { scraper: require('../scrapers/rarbg/rarbg_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_unofficial_dump_scraper') }
// { scraper: require('../scrapers/thepiratebay/thepiratebay_update_size_scraper') }
];

View File

@@ -0,0 +1,54 @@
const Bottleneck = require('bottleneck');
const repository = require('../lib/repository')
const { delay } = require('../lib/promises')
const { updateCurrentSeeders } = require('../lib/torrent')
const { updateTorrentSeeders } = require('../lib/torrentEntries')
const DELAY_MS = 0; // 0 seconds
const updateLimiter = new Bottleneck({ maxConcurrent: 5 });
const statistics = {};
const statisticsNew = {};
function scheduleUpdateSeeders() {
console.log('Starting seeders update...')
getTorrents()
.then(torrents => updateCurrentSeeders(torrents))
.then(updatedTorrents => Promise.all(
updatedTorrents.map(updated => updateLimiter.schedule(() => updateTorrentSeeders(updated)))))
.then(torrents => updateStatistics(torrents, statistics))
.then(() => console.log('Finished seeders update:', statistics))
.catch(error => console.warn('Failed seeders update:', error))
.then(() => delay(DELAY_MS))
.then(() => scheduleUpdateSeeders());
}
function scheduleUpdateSeedersForNewTorrents() {
console.log('Starting seeders update for new torrents...')
getNewTorrents()
.then(torrents => updateCurrentSeeders(torrents))
.then(updatedTorrents => Promise.all(
updatedTorrents.map(updated => updateLimiter.schedule(() => updateTorrentSeeders(updated)))))
.then(torrents => updateStatistics(torrents, statisticsNew))
.then(() => console.log('Finished seeders update for new torrents:', statisticsNew))
.catch(error => console.warn('Failed seeders update for new torrents:', error))
.then(() => delay(30_000))
.then(() => scheduleUpdateSeedersForNewTorrents());
}
async function getTorrents() {
return repository.getUpdateSeedersTorrents()
.catch(() => delay(5000).then(() => getTorrents()))
}
async function getNewTorrents() {
return repository.getUpdateSeedersNewTorrents()
.catch(() => delay(5000).then(() => getNewTorrents()))
}
function updateStatistics(updatedTorrents, statisticsObject) {
const totalTorrents = updatedTorrents.map(nested => nested.length).reduce((a, b) => a + b, 0);
const date = new Date().toISOString().replace(/T.*/, '');
statisticsObject[date] = (statisticsObject[date] || 0) + totalTorrents;
}
module.exports = { scheduleUpdateSeeders, scheduleUpdateSeedersForNewTorrents }

View File

@@ -0,0 +1,182 @@
const cheerio = require('cheerio');
const needle = require('needle');
const Sugar = require('sugar-date');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const defaultProxies = [
'https://1337x.to'
];
const defaultTimeout = 10000;
const maxSearchPage = 50;
const Categories = {
MOVIE: 'Movies',
TV: 'TV',
ANIME: 'Anime',
DOCUMENTARIES: 'Documentaries',
APPS: 'Apps',
GAMES: 'Games',
MUSIC: 'Music',
PORN: 'XXX',
OTHER: 'Other',
};
function torrent(torrentId, config = {}, retries = 2) {
if (!torrentId || retries === 0) {
return Promise.reject(new Error(`Failed ${torrentId} query`));
}
const proxyList = config.proxyList || defaultProxies;
const slug = torrentId.startsWith('/torrent/') ? torrentId.replace('/torrent/', '') : torrentId;
return Promises.first(proxyList
.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${slug}`, config)))
.then((body) => parseTorrentPage(body))
.then((torrent) => ({ torrentId: slug, ...torrent }))
.catch((err) => torrent(slug, config, retries - 1));
}
function search(keyword, config = {}, retries = 2) {
if (!keyword || retries === 0) {
return Promise.reject(new Error(`Failed ${keyword} search`));
}
const proxyList = config.proxyList || defaultProxies;
const page = config.page || 1;
const category = config.category;
const extendToPage = Math.min(maxSearchPage, (config.extendToPage || 1))
const requestUrl = proxyUrl => category
? `${proxyUrl}/category-search/${keyword}/${category}/${page}/`
: `${proxyUrl}/search/${keyword}/${page}/`;
return Promises.first(proxyList
.map(proxyUrl => singleRequest(requestUrl(proxyUrl), config)))
.then(body => parseTableBody(body))
.then(torrents => torrents.length === 40 && page < extendToPage
? search(keyword, { ...config, page: page + 1 }).catch(() => [])
.then(nextTorrents => torrents.concat(nextTorrents))
: torrents)
.catch((err) => search(keyword, config, retries - 1));
}
function browse(config = {}, retries = 2) {
if (retries === 0) {
return Promise.reject(new Error(`Failed browse request`));
}
const proxyList = config.proxyList || defaultProxies;
const page = config.page || 1;
const category = config.category;
const sort = config.sort;
const requestUrl = proxyUrl => sort
? `${proxyUrl}/sort-cat/${category}/${sort}/desc/${page}/`
: `${proxyUrl}/cat/${category}/${page}/`;
return Promises.first(proxyList
.map((proxyUrl) => singleRequest(requestUrl(proxyUrl), config)))
.then((body) => parseTableBody(body))
.catch((err) => browse(config, retries - 1));
}
function singleRequest(requestUrl, config = {}) {
const timeout = config.timeout || defaultTimeout;
const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
return needle('get', requestUrl, options)
.then((response) => {
const body = response.body;
if (!body) {
throw new Error(`No body: ${requestUrl}`);
} else if (body.includes('502: Bad gateway') ||
body.includes('403 Forbidden') ||
!(body.includes('1337x</title>'))) {
throw new Error(`Invalid body contents: ${requestUrl}`);
}
return body;
});
}
function parseTableBody(body) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
reject(new Error('Failed loading body'));
}
const torrents = [];
$('.table > tbody > tr').each((i, element) => {
const row = $(element);
torrents.push({
name: row.find('a').eq(1).text(),
torrentId: row.find('a').eq(1).attr('href').replace('/torrent/', ''),
seeders: parseInt(row.children('td.coll-2').text()),
leechers: parseInt(row.children('td.coll-3').text()),
size: parseSize(row.children('td.coll-4').text())
});
});
resolve(torrents);
});
}
function parseTorrentPage(body) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(body);
if (!$) {
reject(new Error('Failed loading body'));
}
const details = $('.torrent-detail-page');
const magnetLink = details.find('a:contains(\'Magnet Download\')').attr('href');
const imdbIdMatch = details.find('div[id=\'description\']').html().match(/imdb\.com\/title\/(tt\d+)/i);
const torrent = {
name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')),
infoHash: decode(magnetLink).infoHash,
magnetLink: magnetLink,
seeders: parseInt(details.find('strong:contains(\'Seeders\')').next().text(), 10),
leechers: parseInt(details.find('strong:contains(\'Leechers\')').next().text(), 10),
category: details.find('strong:contains(\'Category\')').next().text(),
languages: details.find('strong:contains(\'Language\')').next().text(),
size: parseSize(details.find('strong:contains(\'Total size\')').next().text()),
uploadDate: parseDate(details.find('strong:contains(\'Date uploaded\')').next().text()),
imdbId: imdbIdMatch && imdbIdMatch[1],
files: details.find('div[id=\'files\']').first().find('li')
.map((i, elem) => $(elem).text())
.map((i, text) => ({
fileIndex: i,
name: text.match(/^(.+)\s\(.+\)$/)[1].replace(/^.+\//g, ''),
path: text.match(/^(.+)\s\(.+\)$/)[1],
size: parseSize(text.match(/^.+\s\((.+)\)$/)[1])
})).get()
};
resolve(torrent);
});
}
function parseDate(dateString) {
if (/decade.*ago/i.test(dateString)) {
return Sugar.Date.create('10 years ago');
}
return Sugar.Date.create(dateString);
}
function parseSize(sizeText) {
if (!sizeText) {
return undefined;
}
let scale = 1;
if (sizeText.includes('GB')) {
scale = 1024 * 1024 * 1024
} else if (sizeText.includes('MB')) {
scale = 1024 * 1024;
} else if (sizeText.includes('KB')) {
scale = 1024;
}
return Math.floor(parseFloat(sizeText.replace(/,/g, '')) * scale);
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,111 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const leetx = require('./1337x_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const NAME = '1337x';
const UNTIL_PAGE = 10;
const TYPE_MAPPING = typeMapping();
const limiter = new Bottleneck({ maxConcurrent: 10 });
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
return scrapeLatestTorrents()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return lastScrape.save();
})
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
async function updateSeeders(torrent) {
return limiter.schedule(() => leetx.torrent(torrent.torrentId));
}
async function scrapeLatestTorrents() {
const allowedCategories = [
leetx.Categories.MOVIE,
leetx.Categories.TV,
leetx.Categories.ANIME,
leetx.Categories.DOCUMENTARIES
];
return Promises.sequence(allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category)))
.then(entries => entries.reduce((a, b) => a.concat(b), []));
}
/**
 * Scrapes one listing page of a category, then recurses to the next page
 * while results keep coming, up to untilPage(category).
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await leetx.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processTorrentRecord(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return undefined;
}
// Validates one listing record and persists it as a new torrent entry.
// Returns early when the torrent is already known or the record is invalid.
async function processTorrentRecord(record) {
  // Known hash for this provider -> refresh the existing entry and stop.
  if (await checkAndUpdateTorrent({ provider: NAME, ...record })) {
    return record;
  }
  // Fetch full details; fetch errors are treated the same as an invalid record.
  const torrentFound = await leetx.torrent(record.torrentId).catch(() => undefined);
  if (!torrentFound || !TYPE_MAPPING[torrentFound.category]) {
    return Promise.resolve('Invalid torrent record');
  }
  // isNaN(Date) is true when the site's upload date failed to parse.
  if (isNaN(torrentFound.uploadDate)) {
    console.warn(`Incorrect upload date for [${torrentFound.infoHash}] ${torrentFound.name}`);
    return;
  }
  // The detail page may resolve to an infoHash that is already stored.
  if (await checkAndUpdateTorrent(torrentFound)) {
    return torrentFound;
  }
  const torrent = {
    infoHash: torrentFound.infoHash,
    provider: NAME,
    torrentId: torrentFound.torrentId,
    title: torrentFound.name.replace(/\t|\s+/g, ' ').trim(), // collapse whitespace runs
    type: TYPE_MAPPING[torrentFound.category],
    size: torrentFound.size,
    seeders: torrentFound.seeders,
    uploadDate: torrentFound.uploadDate,
    imdbId: torrentFound.imdbId,
    languages: torrentFound.languages || undefined
  };
  return createTorrentEntry(torrent);
}
/** Builds the site-category -> internal content type lookup table. */
function typeMapping() {
  return {
    [leetx.Categories.MOVIE]: Type.MOVIE,
    [leetx.Categories.DOCUMENTARIES]: Type.SERIES,
    [leetx.Categories.TV]: Type.SERIES,
    [leetx.Categories.ANIME]: Type.ANIME
  };
}
/** Maximum listing page to crawl for the given category. */
function untilPage(category) {
  switch (category) {
    case leetx.Categories.ANIME:
      return 5;
    case leetx.Categories.DOCUMENTARIES:
      return 1;
    default:
      return UNTIL_PAGE;
  }
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,133 @@
const needle = require("needle");
const cheerio = require("cheerio");
const decode = require("magnet-uri");
const Promises = require("../../lib/promises");
const { getRandomUserAgent } = require("../../lib/requestHelper");
// Per-request timeout in milliseconds.
const defaultTimeout = 10000;
// Hard cap on how many result pages a search may follow.
const maxSearchPage = 50;
const baseUrl = 'https://animestorrent.com';
// Site category slugs used to build browse URLs.
const Categories = {
  MOVIE: 'filme',
  ANIME: 'tv',
  OVA: 'ova'
};
/**
 * Fetches a torrent detail page and resolves with the parsed records,
 * retrying up to `retries` times.
 * Fix: the retry previously passed the extracted `slug` back into torrent(),
 * which re-split it (`slug.split("/")[3]` -> undefined) so every retry
 * requested `${baseUrl}/undefined`; retries now reuse the original id.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  // torrentId is a full URL; the slug is its path segment.
  const slug = torrentId.split("/")[3];
  return singleRequest(`${baseUrl}/${slug}`, config)
      .then((body) => parseTorrentPage(body))
      .then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el })))
      .catch(() => torrent(torrentId, config, retries - 1));
}
/**
 * Searches the site for a keyword, retrying up to `retries` times and
 * following pagination up to `extendToPage` while pages come back full (40 rows).
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  const followPagination = (torrents) => {
    if (torrents.length === 40 && page < extendToPage) {
      return search(keyword, { ...config, page: page + 1 })
          .catch(() => [])
          .then((more) => torrents.concat(more));
    }
    return torrents;
  };
  return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
      .then((body) => parseTableBody(body))
      .then(followPagination)
      .catch(() => search(keyword, config, retries - 1));
}
/** Lists one page of a category (or the global feed), retrying up to `retries` times. */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const { category } = config;
  const requestUrl = category
      ? `${baseUrl}/tipo/${category}/page/${page}/`
      : `${baseUrl}/page/${page}/`;
  return singleRequest(requestUrl, config)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}
/** Issues one GET request and resolves with the body, rejecting on empty or error pages. */
function singleRequest(requestUrl, config = {}) {
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: config.timeout || defaultTimeout,
    follow: 2,
  };
  return needle("get", requestUrl, options).then((response) => {
    const body = response.body;
    if (!body) {
      throw new Error(`No body: ${requestUrl}`);
    }
    const isErrorPage = body.includes("502: Bad gateway") || body.includes("403 Forbidden");
    if (isErrorPage) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  });
}
// Extracts listing rows (title + detail-page link) from a listing/search page.
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error("Failed loading body"));
    }
    const torrents = [];
    // Each result is rendered as an <article class="bs"> card.
    $("article.bs").each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("span.ntitle").text(),
        torrentId: row.find("div > a").attr("href"),
      });
    });
    resolve(torrents);
  });
}
/**
 * Parses an anime detail page into one record per magnet link found.
 * Fixes: the Promise executor is no longer `async` (it contained no awaits,
 * and an async executor swallows synchronous throws), `reject` is returned
 * so parsing stops on a bad body, each magnet is decoded once instead of
 * twice, and page-level fields are computed once outside the loop.
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      return reject(new Error("Failed loading body"));
    }
    const magnets = $(`a[href^="magnet"]`)
        .map((i, section) => $(section).attr("href"))
        .get();
    const details = $('div.infox');
    // Page-level metadata shared by every magnet on the page.
    const originalName = details.find('h1.entry-title').text();
    const releaseElem = details.find('b:contains(\'Lançamento:\')')[0];
    const year = releaseElem ? releaseElem.nextSibling.nodeValue.trim() : '';
    // NOTE(review): assumes the 'Tipo:' link is always present on detail pages;
    // a missing link would reject via the implicit TypeError — confirm markup.
    const category = details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4];
    const uploadDate = new Date($("time[itemprop=dateModified]").attr("datetime"));
    const torrents = magnets.map((magnetLink) => {
      const decodedMagnet = decode(magnetLink);
      return {
        title: decodedMagnet.name,
        originalName: originalName,
        year: year,
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: category,
        uploadDate: uploadDate,
      };
    });
    resolve(torrents);
  });
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,108 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const animetorrrent = require("./animestorrent_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getKitsuId } = require("../../lib/metadata");
const NAME = "AnimesTorrent";
// Default number of listing pages crawled per category (see untilPage()).
const UNTIL_PAGE = 5;
// Caps concurrent detail-page requests against the site.
const limiter = new Bottleneck({ maxConcurrent: 5 });
/** Runs a full AnimesTorrent scrape and records the scrape start time on the provider row. */
async function scrape() {
  const startedAt = moment();
  const provider = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  provider.lastScraped = startedAt;
  await provider.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/** Re-fetches the torrent page through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchTorrent = () => animetorrrent.torrent(torrent.torrentId);
  return limiter.schedule(fetchTorrent);
}
/** Sequentially scrapes every allowed category and flattens the per-category results. */
async function scrapeLatestTorrents() {
  const categories = [
    animetorrrent.Categories.MOVIE,
    animetorrrent.Categories.ANIME,
    animetorrrent.Categories.OVA
  ];
  const scrapers = categories.map((category) => () => scrapeLatestTorrentsForCategory(category));
  const results = await Promises.sequence(scrapers);
  return results.reduce((all, batch) => all.concat(batch), []);
}
/**
 * Scrapes one listing page of a category, then recurses to the next page
 * while results keep coming, up to untilPage(category).
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await animetorrrent.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return undefined;
}
/** Expands a listing entry into detail-page records and processes them in order. */
async function processEntry(entry) {
  try {
    const records = await animetorrrent.torrent(entry.torrentId);
    return await Promises.sequence(records.map((record) => () => processTorrentRecord(record)));
  } catch (error) {
    return undefined;
  }
}
// Persists one parsed torrent record: skips known hashes, backfills size,
// seeders and the Kitsu id, then creates the entry.
async function processTorrentRecord(foundTorrent) {
  // Known hash for this provider -> refresh the existing entry and stop.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  // Anime metadata is resolved against Kitsu by original title + year.
  if (!foundTorrent.imdbId && !foundTorrent.kitsuId) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.kitsuId = await getKitsuId(info).catch(() => undefined);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: Type.ANIME,
    imdbId: foundTorrent.imdbId,
    kitsuId: foundTorrent.kitsuId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
/** Maximum listing page to crawl for the given category. */
function untilPage(category) {
  switch (category) {
    case animetorrrent.Categories.ANIME:
      return 5;
    case animetorrrent.Categories.OVA:
      return 3;
    default:
      return UNTIL_PAGE;
  }
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,161 @@
const needle = require("needle")
const moment = require("moment")
const cheerio = require("cheerio");
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtLanguages, sanitizePtOriginalName } = require('../scraperHelper')
// Per-request timeout in milliseconds.
const defaultTimeout = 30000;
// Hard cap on how many result pages a search may follow.
const maxSearchPage = 50
const baseUrl = 'https://comando.to';
// Site category slugs used to build browse URLs.
const Categories = {
  MOVIE: 'filmes',
  TV: 'series',
  ANIME: 'animes',
  DOCUMENTARIES: 'documentario'
};
/**
 * Fetches a torrent detail page by URL and resolves with the parsed records,
 * retrying up to `retries` times on failure.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  // torrentId is a full URL; the slug is its path segment.
  const slug = torrentId.split("/")[3];
  return singleRequest(`${baseUrl}/${slug}`, config)
      .then((body) => parseTorrentPage(body))
      .then((records) => records.map((record) => ({ torrentId: slug, ...record })))
      .catch((err) => {
        console.warn(`Failed Comando ${slug} request: `, err);
        return torrent(torrentId, config, retries - 1)
      });
}
/**
 * Searches the site for a keyword, retrying up to `retries` times and
 * following pagination up to `extendToPage` while pages come back full (40 rows).
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  const followPagination = (torrents) => {
    if (torrents.length === 40 && page < extendToPage) {
      return search(keyword, { ...config, page: page + 1 })
          .catch(() => [])
          .then((more) => torrents.concat(more));
    }
    return torrents;
  };
  return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
      .then((body) => parseTableBody(body))
      .then(followPagination)
      .catch(() => search(keyword, config, retries - 1));
}
/** Lists one page of a category, retrying up to `retries` times. */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const { category } = config;
  return singleRequest(`${baseUrl}/category/${category}/page/${page}/`, config)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}
/** Issues one GET request and resolves with the body, rejecting on empty or error pages. */
function singleRequest(requestUrl, config = {}) {
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: config.timeout || defaultTimeout,
    follow: 2
  };
  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl}`);
        }
        const isErrorPage = body.includes('502: Bad gateway') || body.includes('403 Forbidden');
        if (isErrorPage) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}
// Extracts listing rows (title + detail-page link) from a listing/search page.
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const torrents = [];
    // Each result is rendered as an <article> with an <h2><a> title link.
    $('article').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("h2 > a").text(),
        torrentId: row.find("h2 > a").attr("href")
      });
    });
    resolve(torrents);
  });
}
// Parses a comando.to detail page into one torrent record per PT-dubbed
// magnet link. Page-level metadata (original title, year, category, IMDb id,
// languages) is shared across all magnets found on the page.
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    // Magnets are grouped under <h2><strong> release headings; keep only
    // PT-dubbed headings and collect magnets up to the next h2/hr.
    const magnets = $('h2 > strong')
      .filter((i, elem) => isPtDubbed($(elem).text())).parent()
      .map((i, elem) => $(elem).nextUntil('h2, hr'))
      .map((i, elem) => $(elem).find('a[href^="magnet"]'))
      .map((i, section) => $(section).attr("href")).get();
    // Metadata block: the element containing the 'Servidor'/'Original' labels.
    const details = $('strong, b').filter((i, elem) => $(elem).text().match(/Servidor|Orig(?:\.|inal)/)).parent();
    const imdbIdMatch = details.find('a[href*="imdb.com"]').attr('href')
    const torrents = magnets.map(magnetLink => {
      const decodedMagnet = decode(magnetLink);
      const originalNameElem = details.find('strong, b')
        .filter((i, elem) => $(elem).text().match(/Baixar|Orig(?:\.|inal)|^Título:/));
      const languagesElem = details.find('strong, b')
        .filter((i, elem) => $(elem).text().match(/^\s*([IÍ]dioma|[AÁ]udio)/));
      const originalName = parseOriginalName(originalNameElem);
      // The magnet's display name (if any) is preferred over the page title.
      const title = decodedMagnet.name && escapeHTML(decodedMagnet.name.replace(/\+/g, ' '));
      return {
        title: title ? sanitizePtName(title) : originalName,
        originalName: sanitizePtOriginalName(originalName),
        year: details.find('a[href*="comando.to/category/"]').text(),
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: parseCategory($('div.entry-categories').html()),
        // 'LL' in pt-br, e.g. "17 de janeiro de 2024". NOTE(review): assumes
        // moment's pt-br locale is registered by the app — confirm.
        uploadDate: new Date(moment($('a.updated').text(), 'LL', 'pt-br').format()),
        imdbId: imdbIdMatch ? imdbIdMatch.split('/')[4] : null,
        // NOTE(review): assumes a language label always exists; if missing,
        // languagesElem[0] is undefined and this throws — verify markup.
        languages: sanitizePtLanguages(languagesElem[0].nextSibling.nodeValue)
      }
    });
    resolve(torrents.filter((x) => x));
  });
}
/** Extracts the original title following the label element; strips the first ': ' separator. */
function parseOriginalName(originalNameElem) {
  if (!originalNameElem[0]) {
    return '';
  }
  const nextText = originalNameElem.next().text().trim();
  const rawName = nextText || originalNameElem[0].nextSibling.nodeValue;
  return rawName.replace(/: ?/, '');
}
/** Maps the entry-categories HTML fragment onto a site category constant (undefined if none match). */
function parseCategory(categorys) {
  const $ = cheerio.load(categorys);
  const hasCategoryLink = (label) => Boolean($(`a:contains('${label}')`).text());
  if (hasCategoryLink('animes')) {
    return Categories.ANIME;
  }
  if (hasCategoryLink('Filmes')) {
    return Categories.MOVIE;
  }
  if (hasCategoryLink('Series')) {
    return Categories.TV;
  }
  return undefined;
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,112 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const comando = require("./comando_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { getImdbId } = require("../../lib/metadata");
const NAME = "Comando";
// Default number of listing pages crawled per category (see untilPage()).
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping(); // relies on hoisting of typeMapping() below
// Caps concurrent detail-page requests against the site.
const limiter = new Bottleneck({ maxConcurrent: 5 });
/** Runs a full Comando scrape and records the scrape start time on the provider row. */
async function scrape() {
  const startedAt = moment();
  const provider = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  provider.lastScraped = startedAt;
  await provider.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/** Re-fetches the torrent page through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchTorrent = () => comando.torrent(torrent.torrentId);
  return limiter.schedule(fetchTorrent);
}
/** Sequentially scrapes every allowed category and flattens the per-category results. */
async function scrapeLatestTorrents() {
  const categories = [
    comando.Categories.MOVIE,
    comando.Categories.TV
  ];
  const scrapers = categories.map((category) => () => scrapeLatestTorrentsForCategory(category));
  const results = await Promises.sequence(scrapers);
  return results.reduce((all, batch) => all.concat(batch), []);
}
/**
 * Scrapes one listing page of a category, then recurses to the next page
 * while results keep coming, up to untilPage(category).
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await comando.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return undefined;
}
/** Expands a listing entry into detail-page records and processes them in order. */
async function processEntry(entry) {
  try {
    const records = await comando.torrent(entry.torrentId);
    return await Promises.sequence(records.map((record) => () => processTorrentRecord(record)));
  } catch (error) {
    return undefined;
  }
}
// Persists one parsed torrent record: skips known hashes, backfills size,
// seeders and the IMDb id, then creates the entry.
async function processTorrentRecord(foundTorrent) {
  // Known hash for this provider -> refresh the existing entry and stop.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  // Metadata is resolved against IMDb by original title + year.
  if (!foundTorrent.imdbId && foundTorrent.originalName) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    imdbId: foundTorrent.imdbId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
/** Builds the site-category -> internal content type lookup table. */
function typeMapping() {
  return {
    [comando.Categories.MOVIE]: Type.MOVIE,
    [comando.Categories.DOCUMENTARIES]: Type.SERIES,
    [comando.Categories.TV]: Type.SERIES,
    [comando.Categories.ANIME]: Type.ANIME
  };
}
/**
 * Maximum listing page to crawl for the given category.
 * The TV branch previously returned 5, which equals UNTIL_PAGE — the dead
 * branch has been removed; behavior is unchanged.
 */
function untilPage(category) {
  return UNTIL_PAGE;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,135 @@
const needle = require("needle")
const cheerio = require("cheerio");
const decode = require('magnet-uri');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtLanguages } = require('../scraperHelper')
// Per-request timeout in milliseconds.
const defaultTimeout = 10000;
// Hard cap on how many result pages a search may follow.
const maxSearchPage = 50
const baseUrl = 'https://comoeubaixo.com';
// Site category slugs used to build browse URLs.
const Categories = {
  MOVIE: 'filmes',
  TV: 'series',
  ANIME: 'anime',
  DESENHOS: 'desenhos'
};
/**
 * Fetches a torrent detail page by URL and resolves with the parsed records,
 * retrying up to `retries` times on failure.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  // torrentId is a full URL; the slug is its (URL-encoded) path segment.
  const slug = encodeURIComponent(torrentId.split("/")[3]);
  return singleRequest(`${baseUrl}/${slug}/`, config)
      .then((body) => parseTorrentPage(body))
      .then((records) => records.map((record) => ({ torrentId: slug, ...record })))
      .catch((err) => {
        console.warn(`Failed ComoEuBaixo ${torrentId} request: `, err);
        return torrent(torrentId, config, retries - 1)
      });
}
/**
 * Searches the site for a keyword, retrying up to `retries` times and
 * following pagination up to `extendToPage` while pages come back full (40 rows).
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  const followPagination = (torrents) => {
    if (torrents.length === 40 && page < extendToPage) {
      return search(keyword, { ...config, page: page + 1 })
          .catch(() => [])
          .then((more) => torrents.concat(more));
    }
    return torrents;
  };
  return singleRequest(`${baseUrl}/${keyword}/${page}/`, config)
      .then((body) => parseTableBody(body))
      .then(followPagination)
      .catch(() => search(keyword, config, retries - 1));
}
/** Lists one page of a category (or the global feed), retrying up to `retries` times. */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const { category } = config;
  const requestUrl = category
      ? `${baseUrl}/${category}/${page}/`
      : `${baseUrl}/${page}/`;
  return singleRequest(requestUrl, config)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}
/**
 * Issues one GET request and resolves with the body, rejecting on empty
 * bodies or known error pages.
 * Fix: the empty-Buffer check used `body.size`, a property that does not
 * exist on Buffer (always undefined), so every Buffer response was rejected
 * as "No body"; Buffer exposes `length`.
 */
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
  return needle('get', requestUrl, options)
    .then((response) => {
      const body = response.body;
      if (!body || (Buffer.isBuffer(body) && !body.length)) {
        throw new Error(`No body: ${requestUrl}`);
      } else if (body.includes('502: Bad gateway') ||
        body.includes('403 Forbidden')) {
        throw new Error(`Invalid body contents: ${requestUrl}`);
      }
      return body;
    });
}
// Extracts listing rows from a listing/search page; `isTorrent` marks rows
// that advertise a torrent download (vs. other download kinds).
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const torrents = [];
    $('div.capa_larga.align-middle').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("a").text(),
        torrentId: row.find("a").attr("href"),
        isTorrent: !!row.find("p:contains(\'Torrent\')")[0]
      });
    });
    resolve(torrents);
  });
}
// Parses a detail page into one torrent record per PT-dubbed magnet link.
// Page-level metadata is taken from the 'div#informacoes' block.
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    // Keep only magnets whose link title marks a PT-dubbed release.
    const magnets = $(`a[href^="magnet"]`)
      .filter((i, elem) => isPtDubbed($(elem).attr('title')))
      .map((i, elem) => $(elem).attr("href")).get();
    const details = $('div#informacoes')
    // NOTE(review): assumes the 'Gêneros:' link always exists; a missing
    // element would throw on .attr().split() — verify against site markup.
    const category = details.find('strong:contains(\'Gêneros: \')').next().attr('href').split('/')[0]
    const torrents = magnets.map(magnetLink => {
      const decodedMagnet = decode(magnetLink);
      const name = escapeHTML(decodedMagnet.name || '').replace(/\+/g, ' ');
      const sanitizedTitle = sanitizePtName(name);
      // NOTE(review): the [0].nextSibling chains below assume the 'Baixar'
      // and 'Idioma' labels are always present — confirm; missing labels throw.
      const originalTitle = details.find('strong:contains(\'Baixar\')')[0].nextSibling.nodeValue.split('-')[0];
      const year = details.find('strong:contains(\'Data de Lançamento: \')').next().text().trim();
      // Fall back to a composed title when the magnet name is too short to be useful.
      const fallBackTitle = `${originalTitle.trim()} ${year.trim()} ${sanitizedTitle.trim()}`;
      return {
        title: sanitizedTitle.length > 5 ? sanitizedTitle : fallBackTitle,
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: category,
        uploadDate: new Date($('time').attr('datetime')),
        imdbId: details.find('a[href*="imdb.com"]').attr('href').split('/')[4],
        languages: sanitizePtLanguages(details.find('strong:contains(\'Idioma\')')[0].nextSibling.nodeValue)
      };
    })
    resolve(torrents.filter((x) => x));
  });
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,115 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const comoeubaixo = require("./comoeubaixo_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const NAME = "ComoEuBaixo";
// Default number of listing pages crawled per category (see untilPage()).
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping(); // relies on hoisting of typeMapping() below
// Caps concurrent detail-page requests against the site.
const limiter = new Bottleneck({ maxConcurrent: 5 });
/** Runs a full ComoEuBaixo scrape and records the scrape start time on the provider row. */
async function scrape() {
  const startedAt = moment();
  const provider = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  provider.lastScraped = startedAt;
  await provider.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/** Re-fetches the torrent page through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchTorrent = () => comoeubaixo.torrent(torrent.torrentId);
  return limiter.schedule(fetchTorrent);
}
/** Sequentially scrapes every allowed category and flattens the per-category results. */
async function scrapeLatestTorrents() {
  const categories = [
    comoeubaixo.Categories.MOVIE,
    comoeubaixo.Categories.TV,
    comoeubaixo.Categories.DESENHOS
  ];
  const scrapers = categories.map((category) => () => scrapeLatestTorrentsForCategory(category));
  const results = await Promises.sequence(scrapers);
  return results.reduce((all, batch) => all.concat(batch), []);
}
/**
 * Scrapes one listing page of a category, then recurses to the next page
 * while results keep coming, up to untilPage(category).
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await comoeubaixo.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return undefined;
}
/** Skips non-torrent entries, otherwise expands the entry into records and processes them in order. */
async function processEntry(entry) {
  if (!entry.isTorrent) {
    return entry;
  }
  try {
    const records = await comoeubaixo.torrent(entry.torrentId);
    return await Promises.sequence(records.map((record) => () => processTorrentRecord(record)));
  } catch (error) {
    return undefined;
  }
}
// Persists one parsed torrent record: skips known hashes, backfills size and
// seeders, then creates the entry (IMDb id comes from the page itself).
async function processTorrentRecord(foundTorrent) {
  // Known hash for this provider -> refresh the existing entry and stop.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    imdbId: foundTorrent.imdbId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
/** Builds the site-category -> internal content type lookup table. */
function typeMapping() {
  return {
    [comoeubaixo.Categories.MOVIE]: Type.MOVIE,
    [comoeubaixo.Categories.TV]: Type.SERIES,
    [comoeubaixo.Categories.ANIME]: Type.ANIME,
    [comoeubaixo.Categories.DESENHOS]: Type.SERIES
  };
}
/**
 * Maximum listing page to crawl for the given category.
 * Every branch previously returned UNTIL_PAGE, so the dead branching has
 * been collapsed; behavior is unchanged.
 */
function untilPage(category) {
  return UNTIL_PAGE;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,132 @@
const needle = require("needle");
const cheerio = require("cheerio");
const decode = require("magnet-uri");
const Promises = require("../../lib/promises");
const { getRandomUserAgent } = require("../../lib/requestHelper");
// Per-request timeout in milliseconds.
const defaultTimeout = 10000;
// Hard cap on how many result pages a search may follow.
const maxSearchPage = 50;
const baseUrl = 'https://darkmahou.com';
// Site category slugs used to build browse URLs.
const Categories = {
  MOVIE: 'movie',
  ANIME: 'tv',
  OVA: 'ova'
};
/**
 * Fetches a torrent detail page and resolves with the parsed records,
 * retrying up to `retries` times.
 * Fix: the retry previously passed the extracted `slug` back into torrent(),
 * which re-split it (`slug.split("/")[3]` -> undefined) so every retry
 * requested `${baseUrl}/undefined`; retries now reuse the original id.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  // torrentId is a full URL; the slug is its path segment.
  const slug = torrentId.split("/")[3];
  return singleRequest(`${baseUrl}/${slug}`, config)
      .then((body) => parseTorrentPage(body))
      .then((torrent) => torrent.map((el) => ({ torrentId: slug, ...el })))
      .catch(() => torrent(torrentId, config, retries - 1));
}
/**
 * Searches the site for a keyword, retrying up to `retries` times and
 * following pagination up to `extendToPage` while pages come back full (40 rows).
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  const followPagination = (torrents) => {
    if (torrents.length === 40 && page < extendToPage) {
      return search(keyword, { ...config, page: page + 1 })
          .catch(() => [])
          .then((more) => torrents.concat(more));
    }
    return torrents;
  };
  return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
      .then((body) => parseTableBody(body))
      .then(followPagination)
      .catch(() => search(keyword, config, retries - 1));
}
/** Lists one page of a category (or the global feed), retrying up to `retries` times. */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const { category } = config;
  const requestUrl = category
      ? `${baseUrl}/category/${category}/page/${page}/`
      : `${baseUrl}/page/${page}/`;
  return singleRequest(requestUrl, config)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}
/** Issues one GET request and resolves with the body, rejecting on empty or error pages. */
function singleRequest(requestUrl, config = {}) {
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: config.timeout || defaultTimeout,
    follow: 2,
  };
  return needle("get", requestUrl, options).then((response) => {
    const body = response.body;
    if (!body) {
      throw new Error(`No body: ${requestUrl}`);
    }
    const isErrorPage = body.includes("502: Bad gateway") || body.includes("403 Forbidden");
    if (isErrorPage) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  });
}
// Extracts listing rows (title + detail-page link) from a listing/search page.
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error("Failed loading body"));
    }
    const torrents = [];
    // Each result is rendered as an <article class="bs"> card.
    $("article.bs").each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("span.ntitle").text(),
        torrentId: row.find("div > a").attr("href"),
      });
    });
    resolve(torrents);
  });
}
/**
 * Parses an anime detail page into one record per magnet link found.
 * Fixes: guards the 'Lançado:' lookup (previously `[0].nextSibling` threw a
 * TypeError whenever the label was absent), drops the unnecessary `async`
 * executor (it had no awaits and swallows synchronous throws), returns after
 * reject, and decodes each magnet once instead of twice.
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      return reject(new Error("Failed loading body"));
    }
    const magnets = $(`a[href^="magnet"]`)
        .map((i, section) => $(section).attr("href"))
        .get();
    const details = $('div.infox');
    // Page-level metadata shared by every magnet on the page.
    const originalName = details.find('h1.entry-title').text();
    const releaseElem = details.find('b:contains(\'Lançado:\')')[0];
    const year = releaseElem && releaseElem.nextSibling
        ? releaseElem.nextSibling.nodeValue || ''
        : '';
    // NOTE(review): assumes the 'Tipo:' link is always present on detail pages;
    // a missing link would reject via the implicit TypeError — confirm markup.
    const category = details.find('b:contains(\'Tipo:\')').next().attr('href').split('/')[4];
    const uploadDate = new Date($("time[itemprop=dateModified]").attr("datetime"));
    const torrent = magnets.map((magnetLink) => {
      const decodedMagnet = decode(magnetLink);
      return {
        title: decodedMagnet.name,
        originalName: originalName,
        year: year,
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: category,
        uploadDate: uploadDate,
      };
    });
    resolve(torrent.filter((x) => x));
  });
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,108 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const darkmahou = require("./darkmahou_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getKitsuId } = require("../../lib/metadata");
const NAME = "DarkMahou";
// Default number of listing pages crawled per category (see untilPage()).
const UNTIL_PAGE = 5;
// Caps concurrent detail-page requests against the site.
const limiter = new Bottleneck({ maxConcurrent: 5 });
/** Runs a full DarkMahou scrape and records the scrape start time on the provider row. */
async function scrape() {
  const startedAt = moment();
  const provider = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  provider.lastScraped = startedAt;
  await provider.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/** Re-fetches the torrent page through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchTorrent = () => darkmahou.torrent(torrent.torrentId);
  return limiter.schedule(fetchTorrent);
}
/** Sequentially scrapes every allowed category and flattens the per-category results. */
async function scrapeLatestTorrents() {
  const categories = [
    darkmahou.Categories.MOVIE,
    darkmahou.Categories.ANIME,
    darkmahou.Categories.OVA
  ];
  const scrapers = categories.map((category) => () => scrapeLatestTorrentsForCategory(category));
  const results = await Promises.sequence(scrapers);
  return results.reduce((all, batch) => all.concat(batch), []);
}
/**
 * Scrapes one listing page of a category, then recurses to the next page
 * while results keep coming, up to untilPage(category).
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await darkmahou.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processEntry(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return undefined;
}
/** Expands a listing entry into detail-page records and processes them in order. */
async function processEntry(entry) {
  try {
    const records = await darkmahou.torrent(entry.torrentId);
    return await Promises.sequence(records.map((record) => () => processTorrentRecord(record)));
  } catch (error) {
    return undefined;
  }
}
// Persists one parsed torrent record: skips known hashes, backfills size,
// seeders and the Kitsu id, then creates the entry.
async function processTorrentRecord(foundTorrent) {
  // Known hash for this provider -> refresh the existing entry and stop.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  // Anime metadata is resolved against Kitsu by original title + year.
  if (!foundTorrent.imdbId && !foundTorrent.kitsuId) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.kitsuId = await getKitsuId(info).catch(() => undefined);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: Type.ANIME,
    imdbId: foundTorrent.imdbId,
    kitsuId: foundTorrent.kitsuId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
/**
 * Maximum listing page depth to crawl for the given category.
 */
function untilPage(category) {
  switch (category) {
    case darkmahou.Categories.ANIME:
      return 5;
    case darkmahou.Categories.OVA:
      return 4;
    default:
      return UNTIL_PAGE;
  }
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,94 @@
const needle = require("needle");
const axios = require('axios');
const cheerio = require("cheerio");
const decode = require("magnet-uri");
const Promises = require("../../lib/promises");
const { getRandomUserAgent } = require("../../lib/requestHelper");
// Default request timeout (ms) and the canonical site origin.
const defaultTimeout = 10000;
const baseUrl = 'https://www.erai-raws.info';
// Browsable site sections; values are URL path segments.
const Categories = {
  ANIMES: 'anime',
  EPISODES: 'episodes'
};
/**
 * Browses a category listing page, then fetches each linked anime page
 * sequentially and parses it into torrent entries. The whole operation is
 * retried on any failure until `retries` is exhausted.
 */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const category = config.category;
  const listingUrl = `${baseUrl}/${category}/page/${page}/`;
  return singleRequest(listingUrl, config)
      .then((body) => parseTableBody(body))
      .then((animes) => Promises.sequence(animes.map((anime) => () => singleRequest(anime.animeLink))))
      .then((animeBodies) => Promise.all(animeBodies.map((animeBody) => parseTorrentPage(animeBody))))
      .then((animeInfos) => [].concat(...animeInfos))
      .catch(() => browse(config, retries - 1));
}
/**
 * Fetches a page and resolves with its body, rejecting on empty bodies and
 * known error pages.
 *
 * Fixes two defects: the original passed needle-style options (`userAgent`,
 * `follow`) to axios, which silently ignores them — they are translated to
 * `headers` / `maxRedirects` here — and it checked `Buffer#size`, which does
 * not exist (Buffers expose `length`), so any Buffer body was treated as empty.
 */
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = {
    headers: { 'User-Agent': getRandomUserAgent() },
    timeout: timeout,
    maxRedirects: 2
  };
  return axios.get(requestUrl, options).then((response) => {
    const body = response.data;
    if (!body || (Buffer.isBuffer(body) && !body.length)) {
      throw new Error(`No body: ${requestUrl}`);
    } else if (
      body.includes("502: Bad gateway") ||
      body.includes("403 Forbidden")
    ) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  });
}
/**
 * Extracts anime page links from a listing page body.
 * @returns {Promise<Array<{name: string, animeLink: string}>>}
 */
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      // Must return here: the original fell through and resolved as well.
      return reject(new Error("Failed loading body"));
    }
    const links = $('[itemprop=\'headline\'] a, .content-area a.aa_ss_ops_new')
        .map((i, element) => ({
          name: $(element).text(),
          animeLink: $(element).attr("href"),
        })).get();
    resolve(links);
  });
}
/**
 * Parses an anime page body into torrent entries: every release table
 * contributes its magnet links, each tagged with the languages listed in the
 * table's tooltips.
 *
 * Fixes: drops the `async` promise-executor anti-pattern (errors thrown in an
 * async executor are lost) and returns after `reject` so the function does
 * not also resolve.
 * @returns {Promise<Array<{title, infoHash, trackers, languages}>>}
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      return reject(new Error("Failed loading body"));
    }
    const entries = $('.tab-content table, .content-area table')
        .map((i, entry) => {
          const languages = $(entry).find('.tooltip3').map((_, l) => $(l).attr('data-title')).get().join('/');
          const magnets = $(entry).find('a[href^="magnet"]').map((_, m) => $(m).attr('href')).get();
          return { languages, magnets };
        }).get();
    const torrents = entries
        .map(entry => entry.magnets
            .map(magnet => decode(magnet))
            .map(decodedMagnet => ({
              title: decodedMagnet.name,
              infoHash: decodedMagnet.infoHash,
              trackers: decodedMagnet.tr,
              languages: entry.languages
            })))
        .reduce((a, b) => a.concat(b), []);
    resolve(torrents);
  });
}
module.exports = { browse, Categories };

View File

@@ -0,0 +1,47 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const erairaws = require('./erairaws_api');
const { checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider name used for logging and tagging stored torrents.
const NAME = 'EraiRaws';
// Caps concurrent record processing.
const limiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Runs a full Erai-raws scrape, logging start and finish timestamps.
 */
async function scrape() {
  const scrapeStart = moment();
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/**
 * Scrapes the episodes category and flattens the paged results.
 */
async function scrapeLatestTorrents() {
  const entries = await scrapeLatestTorrentsForCategory(erairaws.Categories.EPISODES);
  return [].concat(...entries);
}
/**
 * Scrapes one category page, processes each torrent through the limiter and
 * recurses until an empty page or the category's page limit. Browse
 * failures are logged and treated as an empty page.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let torrents;
  try {
    torrents = await erairaws.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    torrents = [];
  }
  const processed = await Promise.all(
      torrents.map((torrent) => limiter.schedule(() => processRecord(torrent))));
  if (processed.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return [];
}
/**
 * Stores or refreshes a scraped torrent, tagging it with this provider.
 * Resolves with the original torrent either way.
 */
async function processRecord(foundTorrent) {
  const record = { provider: NAME, ...foundTorrent };
  await checkAndUpdateTorrent(record);
  return foundTorrent;
}
/**
 * Page depth limit: anime listings are crawled much deeper than episodes.
 */
function untilPage(category) {
  return category === erairaws.Categories.ANIMES ? 45 : 3;
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,135 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const Promises = require('../../lib/promises');
const { getRandomUserAgent } = require('./../../lib/requestHelper');
// Known EZTV mirrors to race requests against.
const defaultProxies = [
  'https://eztv.re'
];
const defaultTimeout = 120000;
// Base retry delay plus an equal random jitter window (ms).
const minDelay = 3000;
const jitterDelay = minDelay;
// API page size and the maximum number of pages followed per search.
const limit = 100;
const maxPage = 5;
/**
 * Fetches a single torrent detail page by id, racing all mirrors and
 * retrying once after a jittered delay on failure.
 */
function torrent(torrentId, config = {}, retries = 1) {
  if (!torrentId) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  const requests = defaultProxies
      .map((proxyUrl) => singleRequest(`${proxyUrl}/ep/${torrentId}`, config));
  return Promises.first(requests)
      .then((body) => parseTorrentPage(body))
      .then((parsed) => ({ torrentId, ...parsed }))
      .catch((error) => retries
          ? jitter().then(() => torrent(torrentId, config, retries - 1))
          : Promise.reject(error));
}
/**
 * Searches torrents for an imdb id, transparently following pagination while
 * pages come back full (up to maxPage). Retries once with a jittered delay.
 */
function search(imdbId, config = {}, retries = 1) {
  if (!imdbId) {
    return Promise.reject(new Error(`Failed ${imdbId} search`));
  }
  const id = imdbId.replace('tt', '');
  const page = config.page || 1;
  const requests = defaultProxies.map((proxyUrl) =>
      singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}&imdb_id=${id}`, config));
  return Promises.first(requests)
      .then((results) => parseResults(results))
      .then((torrents) => {
        const hasMore = torrents.length === limit && page < maxPage;
        if (!hasMore) {
          return torrents;
        }
        // Failures on deeper pages are ignored — partial results are kept.
        return search(imdbId, { ...config, page: page + 1 })
            .catch(() => [])
            .then((nextTorrents) => torrents.concat(nextTorrents));
      })
      .catch((error) => retries
          ? jitter().then(() => search(imdbId, config, retries - 1))
          : Promise.reject(error));
}
/**
 * Lists one page of the latest torrents from the EZTV API, retrying once
 * with a jittered delay on failure.
 */
function browse(config = {}, retries = 1) {
  const page = config.page || 1;
  const requests = defaultProxies.map((proxyUrl) =>
      singleRequest(`${proxyUrl}/api/get-torrents?limit=${limit}&page=${page}`, config));
  return Promises.first(requests)
      .then((results) => parseResults(results))
      .catch((error) => retries
          ? jitter().then(() => browse(config, retries - 1))
          : Promise.reject(error));
}
/**
 * Performs a GET and resolves with the parsed body.
 *
 * Fix: reject with a proper Error instead of a bare string so callers get a
 * stack trace and `instanceof Error` checks work.
 */
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: timeout,
    response_timeout: timeout,
    read_timeout: timeout,
    follow: 2
  };
  return needle('get', requestUrl, options)
      .then(response => {
        if (!response.body) {
          return Promise.reject(new Error(`No body: ${requestUrl}`));
        }
        return Promise.resolve(response.body);
      });
}
/**
 * Validates the API response shape and maps every raw torrent.
 *
 * Fix: reject with a proper Error instead of a bare string.
 */
function parseResults(results) {
  if (!results || !Array.isArray(results.torrents)) {
    return Promise.reject(new Error(`Incorrect results ${results}`));
  }
  return results.torrents.map(torrent => parseTorrent(torrent));
}
/**
 * Maps a raw EZTV API torrent object onto the internal torrent shape.
 * An imdb_id of '0' means "unknown" and is mapped to undefined.
 */
function parseTorrent(torrent) {
  const imdbId = torrent.imdb_id !== '0' ? 'tt' + torrent.imdb_id : undefined;
  const parsed = {
    name: torrent.title.replace(/EZTV$/, ''),
    torrentId: torrent.episode_url.replace(/.*\/ep\//, ''),
    infoHash: torrent.hash.trim().toLowerCase(),
    magnetLink: torrent.magnet_url,
    torrentLink: torrent.torrent_url,
    seeders: torrent.seeds,
    size: torrent.size_bytes,
    uploadDate: new Date(torrent.date_released_unix * 1000),
    imdbId: imdbId
  };
  return parsed;
}
// Parses an EZTV episode detail page into a torrent object.
// Relies on raw DOM structure: several fields are read from the text node
// immediately following a <b> label ([0].nextSibling.data), which is fragile
// against markup changes.
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const content = $('table[class="forum_header_border_normal"]');
    const torrent = {
      name: content.find('h1 > span').text().replace(/EZTV$/, ''),
      infoHash: content.find('b:contains(\'Torrent Hash:\')')[0].nextSibling.data.trim().toLowerCase(),
      magnetLink: content.find('a[title="Magnet Link"]').attr('href'),
      torrentLink: content.find('a[title="Download Torrent"]').attr('href'),
      seeders: parseInt(content.find('span[class="stat_red"]').first().text(), 10) || 0,
      size: parseSize(content.find('b:contains(\'Filesize:\')')[0].nextSibling.data),
      // Release date appears as e.g. "5th Mar 2023" (moment 'Do MMM YYYY').
      uploadDate: moment(content.find('b:contains(\'Released:\')')[0].nextSibling.data, 'Do MMM YYYY').toDate(),
      showUrl: content.find('.episode_left_column a').attr('href')
    };
    resolve(torrent);
  });
}
/**
 * Converts a human-readable size string (e.g. "1.5 GB") into bytes.
 * Commas and apostrophes are stripped before parsing; unknown units are
 * treated as plain bytes.
 * @returns {number|undefined} bytes, or undefined for empty input
 */
function parseSize(sizeText) {
  if (!sizeText) {
    return undefined;
  }
  const units = [
    ['GB', 1024 * 1024 * 1024],
    ['MB', 1024 * 1024],
    ['KB', 1024],
    ['kB', 1024]
  ];
  const match = units.find(([unit]) => sizeText.includes(unit));
  const scale = match ? match[1] : 1;
  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
}
/**
 * Waits a random delay between minDelay and minDelay + jitterDelay ms.
 */
function jitter() {
  const delay = minDelay + Math.round(Math.random() * jitterDelay);
  return Promises.delay(delay);
}
module.exports = { torrent, search, browse };

View File

@@ -0,0 +1,85 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const eztv = require('./eztv_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const { isEpisodeImdbId } = require('../../lib/metadata');
// Provider name used for logging and tagging stored torrents.
const NAME = 'EZTV';
// Maximum number of listing pages crawled per scrape.
const UNTIL_PAGE = 10;
// Caps concurrent record processing.
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Runs a full EZTV scrape and persists the scrape start time on success.
 */
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  lastScrape.lastScraped = scrapeStart;
  await lastScrape.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Seeder refresh is intentionally disabled for EZTV and always resolves to
// an empty list. The commented-out approach below (searching per imdb id)
// is kept as documentation of the previously attempted strategy.
async function updateSeeders(torrent, getImdbIdsMethod) {
  // return getImdbIdsMethod()
  //   .then(imdbIds => Promise.all(imdbIds.map(imdbId => limiter.schedule(() => eztv.search(imdbId)))))
  //   .then(results => results.reduce((a, b) => a.concat(b), []))
  //   .catch(() => limiter.schedule(() => eztv.torrent(torrent.torrentId)));
  return Promise.resolve([]);
}
/**
 * Kicks off pagination from the first API page.
 */
async function scrapeLatestTorrents() {
  return scrapeLatestTorrentsForCategory(1);
}
/**
 * Scrapes one API page and recurses to the next until an empty page or
 * UNTIL_PAGE is reached. Browse failures are logged and treated as empty.
 */
async function scrapeLatestTorrentsForCategory(page = 1) {
  console.log(`Scrapping ${NAME} page ${page}`);
  let torrents;
  try {
    torrents = await eztv.browse({ page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] due: `, error);
    torrents = [];
  }
  const resolved = await Promise.all(
      torrents.map((t) => entryLimiter.schedule(() => processTorrentRecord(t))));
  if (resolved.length > 0 && page < UNTIL_PAGE) {
    return scrapeLatestTorrentsForCategory(page + 1);
  }
  return undefined;
}
// Converts a browsed EZTV record into a stored series torrent entry.
// Skips records that are already tracked or have no size.
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  if (!record || !record.size) {
    return Promise.resolve('Invalid torrent record');
  }
  // imdb id for talk shows is usually incorrect on eztv
  // Date-based episodes (no season number) with an episode-level imdb id
  // are therefore stripped of the id before storing.
  const parsedTitle = parse(record.name);
  const dateEpisode = !parsedTitle.season && parsedTitle.date;
  if (dateEpisode && await isEpisodeImdbId(record.imdbId)) {
    delete record.imdbId;
  }
  const torrent = {
    infoHash: record.infoHash,
    provider: NAME,
    torrentId: record.torrentId,
    // Collapse tabs and repeated whitespace into single spaces.
    title: record.name.replace(/\t|\s+/g, ' ').trim(),
    type: Type.SERIES,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId,
  };
  return createTorrentEntry(torrent).then(() => torrent);
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,137 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const Promises = require('../../lib/promises');
// Canonical site origin (overridable via config.proxyUrl) and request timeout (ms).
const defaultUrl = 'https://horriblesubs.info';
const defaultTimeout = 10000;
// Lists every show on the site as { title, url } pairs.
// Note the chained cheerio .map calls: the first yields each show's anchor
// element, the second reads that anchor's attributes.
function allShows(config = {}) {
  return _getContent('/shows/', config)
      .then(($) => $('div[class="ind-show"]')
          .map((index, element) => $(element).children('a'))
          .map((index, element) => ({
            title: element.attr('title'),
            url: `${config.proxyUrl || defaultUrl}${element.attr('href')}`
          })).get());
}
/**
 * Collects full show information: the site show id plus its batch (pack)
 * and single-episode entries. Accepts either a { title, url } object or a
 * raw url string.
 */
async function showData(showInfo, config = {}) {
  const url = showInfo.url || showInfo;
  const showEndpoint = url.match(/\/show.+/)[0];
  const title = showInfo.title;
  const showId = await _getShowId(showEndpoint);
  const packEpisodes = await _getShowEntries(showId, title, 'batch', config);
  const singleEpisodes = await _getShowEntries(showId, title, 'show', config);
  return { title, url, showId, singleEpisodes, packEpisodes };
}
/**
 * Resolves the site's "latest" listing into full show entries, dropping
 * entries that could not be matched.
 */
async function getLatestEntries(config = {}) {
  const latest = await _getAllLatestEntries(config);
  const resolved = await Promises.sequence(
      latest.map((entry) => () => _findLatestEntry(entry, config)));
  return resolved.filter(Boolean);
}
/**
 * GETs an endpoint (absolute or relative) through the configured base url
 * and loads the response body into cheerio.
 */
function _getContent(endpoint, config = {}) {
  const baseUrl = config.proxyUrl || defaultUrl;
  const timeout = config.timeout || defaultTimeout;
  // Absolute urls have their origin swapped for the configured base.
  const url = endpoint.startsWith('http')
      ? endpoint.replace(/https?:\/\/[^/]+/, baseUrl)
      : `${baseUrl}${endpoint}`;
  return needle('get', url, { open_timeout: timeout })
      .then((response) => cheerio.load(response.body));
}
// Extracts the numeric show id from the inline `var hs_showid = N` script on
// a show page. Trailing '#fragment' and '/' suffixes are normalized first.
function _getShowId(showEndpoint) {
  return _getContent(showEndpoint.replace(/(?:#\d+)?\/?$/, '/'))
      .then($ => $('div.entry-content').find('script').html().match(/var hs_showid = (\d+)/)[1]);
}
/**
 * Fetches all entries of a given type for a show, keeping only exact title
 * matches (a show page can list multiple related series).
 */
function _getShowEntries(animeId, animeTitle, type, config) {
  return _getAllEntries(animeId, type, config)
      .then((entries) => entries.filter(({ title }) => title === animeTitle));
}
/**
 * Pages through the show API, concatenating results until an empty page is
 * returned (or only a single page when autoExtend is false).
 */
async function _getAllEntries(animeId, type, config, page = 0, autoExtend = true) {
  const endpoint = `/api.php?method=getshows&type=${type}&showid=${animeId}&nextid=${page}`;
  const entries = await _getEntries(endpoint, config);
  if (!autoExtend || !entries.length) {
    return entries;
  }
  const nextEntries = await _getAllEntries(animeId, type, config, page + 1);
  return entries.concat(nextEntries);
}
// Parses the release containers on an API page into structured entries with
// per-resolution mirror links.
function _getEntries(endpoint, config) {
  return _getContent(endpoint, config)
      .then(($) => $('div[class="rls-info-container"]')
          .map((index, element) => ({
            // Text nodes only (nodeType 3): the label's raw title text,
            // excluding the nested <strong>/<span> children.
            title: $(element).find('a[class="rls-label"]').contents()
                .filter((i, el) => el.nodeType === 3).first().text().trim(),
            episode: $(element).find('a[class="rls-label"]').find('strong').text(),
            uploadDate: _parseDate($(element).find('a[class="rls-label"]').find('span[class="rls-date"]').text()),
            mirrors: $(element).find('div[class="rls-links-container"]').children()
                .map((indexLink, elementLink) => ({
                  // Link element ids end in the resolution, e.g. "...-1080p".
                  resolution: $(elementLink).attr('id').match(/\d+p$/)[0],
                  magnetLink: $(elementLink).find('a[title="Magnet Link"]').attr('href'),
                  torrentLink: $(elementLink).find('a[title="Torrent Link"]').attr('href')
                })).get()
          })).get());
}
/**
 * Pages through the "latest" API. A page shorter than 12 entries marks the
 * end of the listing, so pagination stops there.
 */
async function _getAllLatestEntries(config, page = 0) {
  const pageParam = page === 0 ? '' : `&nextid=${page}`;
  const $ = await _getContent(`/api.php?method=getlatest${pageParam}`, config);
  const entries = $('li a')
      .map((index, element) => ({
        urlEndpoint: $(element).attr('href'),
        episode: $(element).find('strong').text()
      })).get();
  if (entries.length < 12) {
    return entries;
  }
  const nextEntries = await _getAllLatestEntries(config, page + 1);
  return entries.concat(nextEntries);
}
/**
 * Scans a show's paged entries for the episode referenced by a latest-list
 * entry. Resolves with a minimal show object, or undefined when the episode
 * cannot be found before the pages run out.
 */
async function _findLatestEntry(entry, config) {
  const showId = await _getShowId(entry.urlEndpoint);
  for (let page = 0; ; page++) {
    const pageEntries = await _getAllEntries(showId, 'show', config, page, false);
    const match = pageEntries.find((e) => e.episode === entry.episode);
    if (match) {
      return {
        title: match.title,
        url: entry.urlEndpoint,
        showId: showId,
        singleEpisodes: [match]
      };
    }
    if (pageEntries.length === 0) {
      // Exhausted all pages without a match.
      return undefined;
    }
  }
}
/**
 * Parses the site's date strings ("Today", "Yesterday" or "MM/DD/YYYY")
 * into a Date.
 */
function _parseDate(date) {
  if (/today/i.test(date)) {
    return moment().toDate();
  }
  if (/yesterday/i.test(date)) {
    return moment().subtract(1, 'day').toDate();
  }
  return moment(date, 'MM/DD/YYYY').toDate();
}
module.exports = { allShows, showData, getLatestEntries, _getShowId };

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,186 @@
const fs = require('fs');
const moment = require('moment');
const Bottleneck = require('bottleneck');
const decode = require('magnet-uri');
const horriblesubs = require('./horriblesubs_api.js');
const repository = require('../../lib/repository');
const { Type } = require('../../lib/types');
const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const { getMetadata, getKitsuId } = require('../../lib/metadata');
const showMappings = require('./horriblesubs_mapping.json');
// Provider name used for logging and tagging stored torrents.
const NAME = 'HorribleSubs';
const NEXT_FULL_SCRAPE_OFFSET = 5 * 24 * 60 * 60; // 5 days, in seconds
// Show-level vs per-entry concurrency caps.
const limiter = new Bottleneck({ maxConcurrent: 5 });
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
// Runs either a full scrape (all shows) or an incremental scrape (latest
// entries only), depending on how long ago the last full scrape happened.
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  const lastScraped = lastScrape.lastScraped && moment(lastScrape.lastScraped);
  // NOTE: moment#add mutates lastScraped in place; harmless here because the
  // variable is not used again after the comparison.
  if (!lastScraped || lastScraped.add(NEXT_FULL_SCRAPE_OFFSET, 'seconds') < scrapeStart) {
    console.log(`[${scrapeStart}] scrapping all ${NAME} shows...`);
    return _scrapeAllShows()
        .then(() => {
          // Only a successful full scrape updates the timestamp.
          lastScrape.lastScraped = scrapeStart;
          return lastScrape.save();
        })
        .then(() => console.log(`[${moment()}] finished scrapping all ${NAME} shows`));
  } else {
    console.log(`[${scrapeStart}] scrapping latest ${NAME} entries...`);
    return _scrapeLatestEntries()
        .then(() => console.log(`[${moment()}] finished scrapping latest ${NAME} entries`));
  }
}
/**
 * Seeder updates are not supported for HorribleSubs entries.
 * @returns {Promise<Array>} always an empty list
 */
async function updateSeeders(torrent) {
  return [];
}
/**
 * Scrapes only the entries currently on the site's latest listing.
 * Per-entry failures are logged and do not abort the batch.
 */
async function _scrapeLatestEntries() {
  const latestEntries = await horriblesubs.getLatestEntries();
  const jobs = latestEntries.map((entryData) =>
      limiter.schedule(() => _parseShowData(entryData).catch((err) => console.log(err))));
  return Promise.all(jobs);
}
/**
 * Scrapes every show on the site (full scrape); seeder updates are skipped
 * to keep the run fast. Per-show failures are logged and swallowed.
 */
async function _scrapeAllShows() {
  const shows = await horriblesubs.allShows();
  const jobs = shows.map((show) => limiter.schedule(() =>
      horriblesubs.showData(show)
          .then((showData) => _parseShowData(showData, false))
          .catch((err) => console.log(err))));
  return Promise.all(jobs);
}
/**
 * Diagnostic helper: compares kitsu ids resolved via live search against the
 * static mapping file and logs the mismatching shows plus the mismatch ratio.
 * Note: currently samples only the first show (slice(0, 1)).
 */
async function compareSearchKitsuIds() {
  console.log(`${NAME}: initiating kitsu compare...`);
  const allShows = await horriblesubs.allShows();
  const shows = await Promise.all(
      allShows.slice(0, 1).map((show) => limiter.schedule(() => enrichShow(show))));
  const incorrect = shows.filter((show) =>
      showMappings[show.title] && showMappings[show.title].kitsu_id !== show.kitsu_id);
  console.log(incorrect);
  console.log(`Ratio: ${incorrect.length / shows.length}`);
}
// Extends the static show->kitsu mapping file with any shows that are not
// mapped yet, then writes the merged mapping back to disk asynchronously.
async function initMapping() {
  console.log(`${NAME}: initiating kitsu mapping...`);
  const shows = await horriblesubs.allShows()
      .then((shows) => shows.filter((show) => !showMappings[show.title]))
      .then((shows) => Promise.all(shows.map((show) => limiter.schedule(() => enrichShow(show)))))
      // Merges new entries into the in-memory showMappings object (mutates it).
      .then((shows) => shows.reduce((map, show) => (map[show.title] = show, map), showMappings));
  fs.writeFile(
    "./scraper/scrapers/horriblesubs/horriblesubs_mapping.json",
    JSON.stringify(shows), 'utf8',
    (err) => {
      if (err) {
        console.log("An error occurred while writing JSON Object to File.", err);
      } else {
        console.log(`${NAME}: finished kitsu mapping`);
      }
    }
  );
}
/**
 * Augments a show with its site id and kitsu metadata (id, title, imdb id).
 * Both lookups are best effort: the site id falls back to the title and
 * missing kitsu metadata yields undefined fields.
 */
async function enrichShow(show) {
  console.log(`${NAME}: getting show info for ${show.title}...`);
  const showId = await horriblesubs._getShowId(show.url).catch(() => show.title);
  let metadata;
  try {
    const kitsuId = await getKitsuId({ title: show.title });
    metadata = await getMetadata(kitsuId);
  } catch (error) {
    console.log(`Failed getting kitsu meta: ${error.message}`);
    metadata = {};
  }
  return {
    showId: showId,
    kitsu_id: metadata.kitsuId,
    ...show,
    kitsuTitle: metadata.title,
    imdb_id: metadata.imdbId
  };
}
// Converts one show's scraped data into torrent entries and stores them.
// When a show maps to multiple kitsu seasons (kitsu_id is an array),
// absolute episode numbers are re-mapped onto per-season kitsu ids.
async function _parseShowData(showData, updateSeeders = true) {
  console.log(`${NAME}: scrapping ${showData.title} data...`);
  const showMapping = showMappings[showData.title];
  const kitsuId = showMapping && showMapping.kitsu_id;
  if (!showMapping) {
    throw new Error(`No kitsu mapping found for ${showData.title}`);
  }
  if (!kitsuId) {
    throw new Error(`No kitsuId found for ${showData.title}`);
  }
  // sometimes horriblesubs entry contains multiple season in it, so need to split it per kitsu season entry
  // Builds { absoluteEpisode: { kitsuId, episode, title } } by stacking each
  // season's episode count on top of the previous ones.
  const kitsuIdsMapping = Array.isArray(kitsuId) && await Promise.all(kitsuId.map(kitsuId => getMetadata(kitsuId)))
      .then((metas) => metas.reduce((map, meta) => {
        const epOffset = Object.keys(map).length;
        [...Array(meta.totalCount || 1).keys()]
            .map(ep => ep + 1)
            .forEach(ep => map[ep + epOffset] = { kitsuId: meta.kitsuId, episode: ep, title: meta.title });
        return map;
      }, {})) || {};
  // Prefers the season-local title/episode when a multi-season mapping exists.
  const formatTitle = (episodeInfo, mirror) => {
    const mapping = kitsuIdsMapping[episodeInfo.episode.replace(/^0+/, '')];
    if (mapping) {
      return `${mapping.title} - ${mapping.episode} [${mirror.resolution}]`;
    }
    return `${episodeInfo.title} - ${episodeInfo.episode} [${mirror.resolution}]`;
  };
  // Resolves the kitsu id for an episode string; ranges ("01-12") use their
  // start. Unmapped episodes of multi-season shows are dropped (undefined).
  const getKitsuId = inputEpisode => {
    const episodeString = inputEpisode.includes('-') && inputEpisode.split('-')[0] || inputEpisode;
    const episode = parseInt(episodeString, 10);
    if (kitsuIdsMapping[episode]) {
      return kitsuIdsMapping[episode].kitsuId;
    } else if (Array.isArray(kitsuId)) {
      console.warn(`Unmapped episode number for ${showData.title} - ${inputEpisode}`);
      return undefined;
    }
    return kitsuId;
  };
  return Promise.all([].concat(showData.singleEpisodes || []).concat(showData.packEpisodes || [])
      .map(episodeInfo => episodeInfo.mirrors
          .filter(mirror => mirror.magnetLink && mirror.magnetLink.length)
          .map(mirror => ({
            provider: NAME,
            ...mirror,
            infoHash: decode(mirror.magnetLink).infoHash,
            trackers: decode(mirror.magnetLink).tr.join(','),
            title: formatTitle(episodeInfo, mirror),
            type: Type.ANIME,
            kitsuId: getKitsuId(episodeInfo.episode),
            uploadDate: episodeInfo.uploadDate,
          })))
      .reduce((a, b) => a.concat(b), [])
      // Entries without a resolvable kitsu id are dropped.
      .filter(torrent => torrent.kitsuId)
      .map(torrent => entryLimiter.schedule(() => processTorrentRecord(torrent, updateSeeders))))
      .then(() => console.log(`${NAME}: finished scrapping ${showData.title} data`));
}
/**
 * Persists one torrent: for entries this provider already tracks the
 * seeders are optionally refreshed, otherwise size and seeders are resolved
 * before creating a new entry. Creation failures are logged, not rethrown.
 */
async function processTorrentRecord(torrent, updateSeeders = true) {
  const existingTorrent = await repository.getTorrent(torrent).catch(() => undefined);
  if (existingTorrent && existingTorrent.provider === NAME) {
    if (!updateSeeders) {
      return torrent;
    }
    return updateCurrentSeeders(torrent)
        .then((updatedSeeders) => checkAndUpdateTorrent(updatedSeeders));
  }
  return updateTorrentSize(torrent)
      .then((updated) => updateCurrentSeeders(updated))
      .then((updated) => createTorrentEntry(updated, true))
      .catch((error) => console.warn(`Failed creating entry for ${torrent.title}:`, error));
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,175 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
// Known Kickass mirrors to race requests against.
const defaultProxies = [
  'https://katcr.co'
];
// Default request timeout (ms).
const defaultTimeout = 10000;
// Site category slugs used in browse/search urls.
const Categories = {
  MOVIE: 'movies',
  TV: 'tv',
  ANIME: 'anime',
  APPS: 'applications',
  GAMES: 'games',
  MUSIC: 'music',
  BOOKS: 'books',
  PORN: 'xxx',
  OTHER: 'other',
};
/**
 * Fetches a torrent detail page by id, racing all proxies and retrying up
 * to `retries` times before failing.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const requests = proxyList.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`, config));
  return Promises.first(requests)
      .then((body) => parseTorrentPage(body))
      .then((parsed) => ({ torrentId, ...parsed }))
      .catch(() => torrent(torrentId, config, retries - 1));
}
/**
 * Searches torrents by keyword within an optional category, racing all
 * proxies and retrying up to `retries` times.
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  const requests = proxyList.map((proxyUrl) =>
      singleRequest(`${proxyUrl}/search/${keyword}/${page}/99/${category}`, config));
  return Promises.first(requests)
      .then((body) => parseTableBody(body))
      .catch(() => search(keyword, config, retries - 1));
}
/**
 * Lists a category page, racing all proxies and retrying up to `retries`
 * times.
 */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  const requests = proxyList.map((proxyUrl) =>
      singleRequest(`${proxyUrl}/category/${category}/page/${page}`, config));
  return Promises.first(requests)
      .then((body) => parseTableBody(body))
      .catch(() => browse(config, retries - 1));
}
/**
 * GETs a url and resolves with the body, rejecting on empty bodies,
 * access-denied pages and anything that does not look like a Kickass page.
 */
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  return needle('get', requestUrl, { open_timeout: timeout, follow: 2 })
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl}`);
        }
        if (body.includes('Access Denied')) {
          console.log(`Access Denied: ${requestUrl}`);
          throw new Error(`Access Denied: ${requestUrl}`);
        }
        const invalid = body.includes('502: Bad gateway')
            || body.includes('403 Forbidden')
            || body.includes('Origin DNS error')
            || !body.includes('Kickass Torrents</title>');
        if (invalid) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}
// Parses a search/browse results table into torrent summaries.
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const torrents = [];
    $('.table > tbody > tr').each((i, element) => {
      const row = $(element);
      const magnetLink = row.find('a[title="Torrent magnet link"]').attr('href');
      torrents.push({
        name: row.find('a[class="torrents_table__torrent_title"]').first().children('b').text(),
        infoHash: decode(magnetLink).infoHash,
        magnetLink: magnetLink,
        // The detail-page path (sans '/torrent/') doubles as the torrent id.
        torrentId: row.find('a[class="torrents_table__torrent_title"]').first().attr('href').replace('/torrent/', ''),
        category: row.find('span[class="torrents_table__upload_info"]').first().children('a').first().attr('href')
            .match(/category\/([^\/]+)/)[1],
        seeders: parseInt(row.find('td[data-title="Seed"]').first().text()),
        leechers: parseInt(row.find('td[data-title="Leech"]').first().text()),
        size: parseSize(row.find('td[data-title="Size"]').first().text()),
        // The "Age" cell's title attribute holds the absolute date.
        uploadDate: moment(row.find('td[data-title="Age"]').first().attr('title')).toDate()
      });
    });
    resolve(torrents);
  });
}
// Parses a torrent detail page into a full torrent object, including the
// file listing when it is complete.
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const content = $('div[class="col"]').first();
    const info = content.find('div[class="torrent_stats"]').parent();
    const description = content.find('div[id="main"]');
    const magnetLink = info.find('a[title="Download verified Magnet"]').attr('href');
    // An imdb link anywhere in the description identifies the title.
    const imdbIdMatch = description.html().match(/imdb\.com\/title\/(tt\d+)/i);
    const torrent = {
      name: info.find('h1').first().text(),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      seeders: parseInt(info.find('span[class="torrent_stats__seed_count mr-2"]').first().text().match(/\d+/)[0], 10),
      leechers: parseInt(info.find('span[class="torrent_stats__leech_count mr-2"]').first().text().match(/\d+/)[0], 10),
      category: info.find('small').first().children('a').first().attr('href').match(/\/category\/([^\/]+)/)[1],
      languages: description.find('span:contains(\'Audio\')').next().children().eq(0).text(),
      // Total size is embedded in the first file-list item as "(Size: ...)".
      size: parseSize(description.find('ul[class="file_list"]').first().find('li').first().contents().eq(2).text()
          .match(/\(Size: (.+)\)/)[1]),
      uploadDate: moment(info.find('time').first().text()).toDate(),
      imdbId: imdbIdMatch && imdbIdMatch[1],
      files: content.find('ul[class="file_list"]').first().find('li > ul > li[class="file_list__file"]')
          .map((i, elem) => $(elem))
          .map((i, ele) => ({
            fileIndex: i,
            name: ele.find('span > ul > li').contents().eq(1).text().trim().replace(/^.+\//g, ''),
            path: ele.find('span > ul > li').contents().eq(1).text().trim(),
            size: parseSize(ele.contents().eq(2).text())
          })).get()
    };
    if (torrent.files.length >= 50) {
      // a max of 50 files are displayed on the page
      delete torrent.files;
    }
    resolve(torrent);
  });
}
/**
 * Translates a textual size like "700 MB" into a byte count. Commas and
 * apostrophes are stripped before parsing; unknown units are plain bytes.
 * @param {string} sizeText
 * @returns {number|undefined} bytes, or undefined for empty input
 */
function parseSize(sizeText) {
  if (!sizeText) {
    return undefined;
  }
  const scale = sizeText.includes('GB') ? 1024 ** 3
      : sizeText.includes('MB') ? 1024 ** 2
          : (sizeText.includes('KB') || sizeText.includes('kB')) ? 1024
              : 1;
  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,98 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const LineByLineReader = require('line-by-line');
const fs = require('fs');
const { Type } = require('../../lib/types');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
// Provider name used for logging and tagging stored torrents.
const NAME = 'KickassTorrents';
// Location of the pipe-separated KAT dump to import.
const CSV_FILE_PATH = '/tmp/kickass.csv';
// Caps concurrent record processing while streaming the dump.
const limiter = new Bottleneck({ maxConcurrent: 40 });
/**
 * Streams the KAT CSV dump line by line, throttling DB work through the
 * limiter and pausing the reader while the limiter queue is busy.
 *
 * Fixes three runtime defects: the original referenced an undefined
 * `lastDump` variable (ReferenceError on first call), called `fs.unlink`
 * without a callback (TypeError on modern Node), and read `.length` off a
 * `String#match` result that can be null.
 */
async function scrape() {
  console.log(`starting to scrape KAT dump: ${CSV_FILE_PATH}`);
  let entriesProcessed = 0;
  const lr = new LineByLineReader(CSV_FILE_PATH);
  lr.on('line', (line) => {
    if (entriesProcessed % 1000 === 0) {
      console.log(`Processed ${entriesProcessed} entries`);
    }
    // Pipe-separated row; quoted fields may themselves contain pipes.
    const row = line.match(/(?<=^|\|)(".*"|[^|]+)(?=\||$)/g);
    if (!row || row.length !== 11) {
      console.log(`Invalid row: ${line}`);
      return;
    }
    const torrent = {
      infoHash: row[0].toLowerCase(),
      // Strip quotes, decode common HTML entities and collapse whitespace.
      title: row[1]
          .replace(/^"|"$/g, '')
          .replace(/&amp;/g, '&')
          .replace(/&\w{2,6};/g, ' ')
          .replace(/\s+/g, ' ')
          .trim(),
      category: row[2],
      size: parseInt(row[5], 10),
      seeders: parseInt(row[8], 10),
      uploadDate: moment.unix(parseInt(row[10], 10)).toDate(),
    };
    // Back-pressure: stop reading while the limiter still has queued work.
    if (!limiter.empty()) {
      lr.pause();
    }
    limiter.schedule(() => processTorrentRecord(torrent)
        .catch((error) => console.log(`failed ${torrent.title} due: ${error}`)))
        .then(() => limiter.empty())
        .then((empty) => empty && lr.resume())
        .then(() => entriesProcessed++);
  });
  lr.on('error', (err) => {
    console.log(err);
  });
  lr.on('end', () => {
    // fs.unlink requires a callback; deletion failures are only logged.
    fs.unlink(CSV_FILE_PATH, (err) => err && console.log(err));
    console.log(`finished to scrape KAT dump: ${CSV_FILE_PATH}!`);
  });
}
// Maps KAT dump categories onto internal content types; records in any
// other category are skipped.
const categoryMapping = {
  "Movies": Type.MOVIE,
  "TV": Type.SERIES,
  "Anime": Type.ANIME
};
/**
 * Imports one CSV record: unsupported categories and dead torrents are
 * recorded as skipped, known entries are left alone, and anything the live
 * lookup cannot confirm is also skipped.
 */
async function processTorrentRecord(record) {
  if (!categoryMapping[record.category] || record.seeders === 0) {
    return createSkipTorrentEntry(record);
  }
  const stored = await getStoredTorrentEntry(record);
  if (stored) {
    return undefined;
  }
  const torrentFound = await findTorrent(record).catch(() => undefined);
  if (!torrentFound) {
    return createSkipTorrentEntry(record);
  }
  return createTorrentEntry({
    infoHash: record.infoHash,
    provider: NAME,
    title: torrentFound.name,
    size: record.size,
    type: categoryMapping[record.category],
    imdbId: torrentFound.imdbId,
    uploadDate: record.uploadDate,
    seeders: torrentFound.seeders,
  });
}
/**
 * Live-site lookup for a dump record. Currently a stub that always rejects,
 * so every record falls through to createSkipTorrentEntry.
 *
 * Fix: reject with a proper Error instead of a bare string so any future
 * unhandled rejection carries a stack trace.
 */
async function findTorrent(record) {
  return Promise.reject(new Error("not found"));
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,91 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const kickass = require('./kickass_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider name used for logging and tagging stored torrents.
const NAME = 'KickassTorrents';
// Maximum number of listing pages crawled per category.
const UNTIL_PAGE = 10;
// NOTE(review): typeMapping() is presumably a hoisted function declared
// later in this file — confirm before reordering module-level code.
const TYPE_MAPPING = typeMapping();
// Caps concurrent record processing.
const limiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Runs a full KickassTorrents scrape and advances the provider's
 * lastScraped watermark to the moment the scrape started.
 */
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  lastScrape.lastScraped = scrapeStart;
  await lastScrape.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Re-fetches the torrent through the rate limiter to refresh seeder counts.
async function updateSeeders(torrent) {
  const refresh = () => kickass.torrent(torrent.torrentId);
  return limiter.schedule(refresh);
}
// Scrapes each allowed category sequentially and flattens the results.
async function scrapeLatestTorrents() {
  const allowedCategories = [
    kickass.Categories.MOVIE,
    kickass.Categories.TV,
    kickass.Categories.ANIME,
  ];
  const scrapers = allowedCategories
      .map(category => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(scrapers);
  return [].concat(...entries);
}
// Recursively walks category pages up to UNTIL_PAGE, processing each listing
// through the shared limiter; a failed page request yields an empty page.
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let listings;
  try {
    listings = await kickass.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    listings = [];
  }
  const resolved = await Promise.all(
      listings.map(entry => limiter.schedule(() => processTorrentRecord(entry))));
  if (resolved.length > 0 && page < UNTIL_PAGE) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return Promise.resolve();
}
/**
 * Resolves a browse listing into a full torrent page and stores it.
 * Returns the record unchanged when it is already known, or a marker
 * string when the page could not be resolved to a supported category.
 */
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  const found = await kickass.torrent(record.torrentId).catch(() => undefined);
  const type = found && TYPE_MAPPING[found.category];
  if (!type) {
    return 'Invalid torrent record';
  }
  const torrent = {
    infoHash: found.infoHash,
    provider: NAME,
    torrentId: found.torrentId,
    title: found.name.replace(/\t|\s+/g, ' '),
    type,
    size: found.size,
    seeders: found.seeders,
    uploadDate: found.uploadDate,
    imdbId: found.imdbId,
    languages: found.languages || undefined
  };
  await createTorrentEntry(torrent);
  return torrent;
}
// Builds the category -> internal type lookup once at module load.
function typeMapping() {
  return {
    [kickass.Categories.MOVIE]: Type.MOVIE,
    [kickass.Categories.TV]: Type.SERIES,
    [kickass.Categories.ANIME]: Type.ANIME,
  };
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,141 @@
const needle = require("needle")
const moment = require("moment")
const cheerio = require("cheerio");
const decode = require('magnet-uri');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper')
const defaultTimeout = 10000;
const maxSearchPage = 50
const baseUrl = 'https://lapumia.org';
// Site sections used to build browse URLs; movies live at the site root,
// hence the null path segment for MOVIE.
const Categories = {
  MOVIE: null,
  TV: 'series',
  ANIME: 'animes',
};
/**
 * Fetches a single Lapumia post and returns one entry per magnet found on it,
 * each tagged with the post slug as torrentId. Retries on failure.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  const slug = torrentId.split('?p=')[1];
  const requestUrl = `${baseUrl}/?p=${slug}`;
  return singleRequest(requestUrl, config)
      .then(parseTorrentPage)
      .then(entries => entries.map(entry => ({ torrentId: slug, ...entry })))
      .catch(error => {
        console.warn(`Failed Lapumia ${slug} request: `, error);
        return torrent(torrentId, config, retries - 1);
      });
}
/**
 * Keyword search, optionally following pagination while the current page is
 * full (10 results) and extendToPage (capped at maxSearchPage) is not reached.
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  return singleRequest(`${baseUrl}/page/${page}/?s=${keyword}`, config)
      .then(parseTableBody)
      .then(torrents => {
        const hasMore = torrents.length === 10 && page < extendToPage;
        if (!hasMore) {
          return torrents;
        }
        return search(keyword, { ...config, page: page + 1 })
            .catch(() => [])
            .then(nextTorrents => torrents.concat(nextTorrents));
      })
      .catch(() => search(keyword, config, retries - 1));
}
// Lists one page of a category (or of the site root when category is null).
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const categoryPath = config.category ? `${config.category}/` : '';
  const requestUrl = `${baseUrl}/${categoryPath}page/${page}/`;
  return singleRequest(requestUrl, config)
      .then(parseTableBody)
      .catch(() => browse(config, retries - 1));
}
// Performs one GET with a random user agent and rejects on empty or
// gateway/blocked bodies so callers can trigger their retry logic.
function singleRequest(requestUrl, config = {}) {
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: config.timeout || defaultTimeout,
    follow: 2
  };
  return needle('get', requestUrl, options).then(response => {
    const body = response.body;
    if (!body) {
      throw new Error(`No body: ${requestUrl}`);
    }
    const blocked = body.includes('502: Bad gateway') || body.includes('403 Forbidden');
    if (blocked) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  });
}
/**
 * Extracts {name, torrentId} pairs from a listing page.
 * Fix: return immediately after reject — previously execution fell through
 * and resolve() was still reached after rejecting.
 */
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      return reject(new Error('Failed loading body'));
    }
    const torrents = [];
    $('div.post').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("div > a").text(),
        torrentId: row.find("div > a").attr("href")
      });
    });
    resolve(torrents);
  });
}
/**
 * Parses a Lapumia post page into one entry per dubbed magnet link.
 * Assumes the post layout: magnet anchors grouped under h2/span headings,
 * metadata (original title, year, language) in div.content bold labels —
 * TODO(review): confirm layout still matches the live site.
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    // Keep only sections whose heading passes the PT-dub filter, then collect
    // every magnet anchor between that heading and the next h2/hr separator.
    const magnets = $('h2 > span')
        .filter((i, elem) => isPtDubbed($(elem).text())).parent()
        .map((i, elem) => $(elem).nextUntil('h2, hr'))
        .map((i, elem) => $(elem).find('a[href^="magnet"]'))
        .map((i, section) => $(section).attr("href")).get();
    const category = parseCategory($('div.category').html());
    const details = $('div.content')
    const torrents = magnets.map(magnetLink => ({
      // Title comes from the magnet's display name, not the page heading.
      title: sanitizePtName(escapeHTML(decode(magnetLink).name.replace(/\+/g, ' '))),
      originalName: sanitizePtOriginalName(details.find('b:contains(\'Titulo Original:\')')[0].nextSibling.nodeValue),
      year: details.find('b:contains(\'Ano de Lançamento:\')')[0].nextSibling.nodeValue.trim(),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      category: category,
      // Upload date is the text before the first bullet, in pt-br long format.
      uploadDate: new Date(moment($('div.infos').text().split('•')[0].trim(), 'LL', 'pt-br').format()),
      imdbId: $('.imdbRatingPlugin').attr('data-title') || null,
      languages: sanitizePtLanguages(details.find('b:contains(\'Idioma\')')[0].nextSibling.nodeValue)
    }))
    resolve(torrents.filter((x) => x));
  });
}
// Inspects a post's category links to decide its content type;
// anything that is neither anime nor series counts as a movie.
function parseCategory(categorys) {
  const $ = cheerio.load(categorys);
  if ($('a:contains(\'Animes\')').text()) {
    return Categories.ANIME;
  }
  return $('a:contains(\'Series\')').text() ? Categories.TV : Categories.MOVIE;
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,114 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const lapumia = require("./lapumia_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getImdbId } = require("../../lib/metadata");
const NAME = "Lapumia";
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping();
const limiter = new Bottleneck({ maxConcurrent: 5 });
/**
 * Runs a full Lapumia scrape, then stamps the provider record with the
 * scrape start time.
 */
async function scrape() {
  const startedAt = moment();
  const providerRecord = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  providerRecord.lastScraped = startedAt;
  await providerRecord.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Re-fetches the post through the limiter to refresh seeder information.
async function updateSeeders(torrent) {
  const fetchAgain = () => lapumia.torrent(torrent.torrentId);
  return limiter.schedule(fetchAgain);
}
// Only the movie section is scraped for this provider; results flattened.
async function scrapeLatestTorrents() {
  const allowedCategories = [lapumia.Categories.MOVIE];
  const tasks = allowedCategories
      .map(category => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(tasks);
  return [].concat(...entries);
}
// Walks category pages recursively until untilPage(category) or an empty page.
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let listings;
  try {
    listings = await lapumia.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    listings = [];
  }
  const resolved = await Promise.all(
      listings.map(listing => limiter.schedule(() => processEntry(listing))));
  if (resolved.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return Promise.resolve();
}
// Expands a listing into its magnet records and processes them in order;
// any failure is deliberately swallowed (best-effort scraping).
async function processEntry(entry) {
  try {
    const records = await lapumia.torrent(entry.torrentId);
    return await Promises.sequence(records.map(record => () => processTorrentRecord(record)));
  } catch (ignored) {
    return undefined;
  }
}
/**
 * Enriches a parsed record (size, seeders, imdbId) in place, then persists it.
 * The enrichment order matters: size/seeders lookups mutate foundTorrent
 * before the entry object is built.
 */
async function processTorrentRecord(foundTorrent) {
  // Already stored — seeders were refreshed by checkAndUpdateTorrent.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  // Anime is matched by other means; for movies/series resolve imdbId from
  // the original title + year when the page did not provide one.
  if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    imdbId: foundTorrent.imdbId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
// Builds the Lapumia category -> internal type lookup once at module load.
function typeMapping() {
  return {
    [lapumia.Categories.MOVIE]: Type.MOVIE,
    [lapumia.Categories.TV]: Type.SERIES,
    [lapumia.Categories.ANIME]: Type.ANIME,
  };
}
// Per-category page depth: anime churns least, so it gets the fewest pages.
function untilPage(category) {
  switch (category) {
    case lapumia.Categories.TV:
      return 5;
    case lapumia.Categories.ANIME:
      return 2;
    default:
      return UNTIL_PAGE;
  }
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,73 @@
const { pantsu } = require('nyaapi')
// NyaaPantsu category codes in "<category>_<sub_category>" form; an empty
// sub-category (e.g. '3_') selects the whole parent category.
const Categories = {
  ANIME: {
    ALL: '3_',
    ENGLISH: '3_5',
    RAW: '3_6',
    MUSIC_VIDEO: '3_12',
    NON_ENGLISH: '3_13',
  },
  LIVE_ACTION: {
    ALL: '5_',
    ENGLISH: '5_9',
    RAW: '5_11',
    PROMOTIONAL_VIDEO: '5_10',
    NON_ENGLISH: '5_18',
  }
}
// Fetches one NyaaPantsu torrent by id and normalises it.
function torrent(torrentId) {
  if (!torrentId) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  return pantsu.infoRequest(torrentId)
      .then(parseTorrent)
      .catch(error => handleError(error, torrentId));
}
// Full-text search, normalising every result.
function search(query) {
  return pantsu.search(query)
      .then(results => results.map(entry => parseTorrent(entry)))
      .catch(error => handleError(error, query));
}
// Lists one page of a category (default: English-translated anime).
function browse(config = {}) {
  const page = config.page || 1;
  const category = config.category || Categories.ANIME.ENGLISH;
  return pantsu.list(category, page)
      .then(results => results.map(entry => parseTorrent(entry)))
      .catch(error => handleError(error, category));
}
// Wraps HTTP-level failures (status >= 400) in a descriptive Error;
// any other failure is re-rejected untouched.
function handleError(error, identifier) {
  const status = error.statusCode;
  if (status && status >= 400) {
    const message = `${status}: [${identifier}] failed retrieval on NyaaPantsu`;
    return Promise.reject(new Error(message));
  }
  return Promise.reject(error);
}
/**
 * Normalises a raw NyaaPantsu API entry into the internal torrent shape.
 * Fix: the per-file `name` used `replace(/([^\/]+$)/, '$1')`, which replaces
 * the basename with itself (a no-op), so `name` was identical to `path`;
 * strip the directory prefix instead so `name` is the bare filename.
 */
function parseTorrent(torrent) {
  return {
    title: torrent.name.replace(/\t|\s+/g, ' ').trim(),
    torrentId: torrent.id,
    infoHash: torrent.hash.trim().toLowerCase(),
    magnetLink: torrent.magnet,
    torrentLink: torrent.torrent,
    seeders: torrent.seeders,
    size: torrent.filesize,
    uploadDate: new Date(torrent.date),
    category: `${torrent.category}_${torrent.sub_category}`,
    languages: torrent.languages ? torrent.languages.join(',') : undefined,
    files: torrent.file_list && torrent.file_list.length ? torrent.file_list.map((file, fileId) => ({
      fileIndex: fileId,
      name: file.path.replace(/^.*\//, ''),
      path: file.path,
      size: file.filesize
    })) : undefined
  }
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,97 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const pantsu = require('./nyaa_pantsu_api');
const { Type } = require('../../lib/types');
const Promises = require('../../lib/promises');
const repository = require('../../lib/repository');
const { updateCurrentSeeders, updateTorrentSize } = require('../../lib/torrent');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const NAME = 'NyaaPantsu';
const UNTIL_PAGE = 5
const limiter = new Bottleneck({ maxConcurrent: 5 });
/**
 * Runs a full NyaaPantsu scrape and advances the provider's lastScraped
 * watermark to the scrape start time.
 */
async function scrape() {
  const startedAt = moment();
  const providerRecord = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  providerRecord.lastScraped = startedAt;
  await providerRecord.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Re-fetches the torrent and reports it only when a usable seeder count
// came back; otherwise reports no update.
async function updateSeeders(torrent) {
  const found = await limiter.schedule(() => pantsu.torrent(torrent.torrentId));
  return Number.isInteger(found.seeders) ? [found] : [];
}
// Only English-translated anime is scraped; per-category results flattened.
async function scrapeLatestTorrents() {
  const allowedCategories = [pantsu.Categories.ANIME.ENGLISH];
  const tasks = allowedCategories
      .map(category => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(tasks);
  return [].concat(...entries);
}
/**
 * Recursively scrapes up to UNTIL_PAGE pages of the given category.
 * Fix: the category was previously dropped from the browse call
 * (`pantsu.browse(({ page }))`), so every configured category actually
 * scraped the API's default listing; pass it through explicitly.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return pantsu.browse({ category, page })
      .catch(error => {
        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)
          .catch(error => {
            console.warn(`Failed processing [${torrent.infoHash}] ${torrent.title} due: `, error);
            return Promise.resolve();
          })))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}
/**
 * Backfills missing size/seeders on the record, then persists it as an
 * anime entry. Known records are returned unchanged.
 */
async function processTorrentRecord(record) {
  if (!record || await checkAndUpdateTorrent(record)) {
    return record;
  }
  if (!record.size) {
    await updateTorrentSize(record);
  }
  if (record.seeders === undefined || record.seeders === null) {
    await updateCurrentSeeders(record);
  }
  const torrent = {
    infoHash: record.infoHash,
    torrentLink: record.torrentLink,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.title,
    type: Type.ANIME,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    languages: record.languages,
    files: record.files || undefined
  };
  await createTorrentEntry(torrent);
  return torrent;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,80 @@
const { si } = require('nyaapi')
// Nyaa.si category codes in "<category>_<sub_category>" form;
// sub-category 0 selects the whole parent category.
const Categories = {
  ANIME: {
    ALL: '1_0',
    MUSIC_VIDEO: '1_1',
    ENGLISH: '1_2',
    NON_ENGLISH: '1_3',
    RAW: '1_4'
  },
  LIVE_ACTION: {
    ALL: '4_0',
    ENGLISH: '4_1',
    PROMOTIONAL_VIDEO: '4_2',
    NON_ENGLISH: '4_3',
    RAW: '4_4'
  }
}
// Fetches one Nyaa.si torrent by id, tagging the result with that id.
function torrent(torrentId) {
  if (!torrentId) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  return si.infoRequest(torrentId)
      .then(result => ({ ...parseTorrent(result), torrentId }))
      .catch(error => {
        const notFound = error.statusCode && error.statusCode === 404;
        return Promise.reject(notFound
            ? new Error(`404: [${torrentId}] not found on NyaaSi`)
            : error);
      });
}
// Full-text search, normalising every result.
function search(query) {
  return si.search(query).then(results => results.map(entry => parseTorrent(entry)));
}
// Lists one page of a category (default: English anime, sorted by id).
function browse(config = {}) {
  const page = config.page || 1;
  const category = config.category || Categories.ANIME.ENGLISH;
  const sort = config.sort || 'id';
  return si.list(category, page, { sort })
      .then(response => (response.results || []).map(entry => parseTorrent(entry)));
}
/**
 * Normalises a raw Nyaa.si API entry into the internal torrent shape.
 * Fix: parseInt now gets an explicit base-10 radix.
 */
function parseTorrent(torrent) {
  return {
    title: torrent.name.replace(/\t|\s+/g, ' ').trim(),
    torrentId: torrent.id,
    infoHash: torrent.hash.trim().toLowerCase(),
    magnetLink: torrent.magnet,
    torrentLink: torrent.torrent,
    seeders: parseInt(torrent.seeders, 10),
    size: parseSize(torrent.filesize),
    uploadDate: new Date(torrent.date),
    category: torrent.sub_category,
  }
}
/**
 * Converts a human-readable size string (e.g. "1.5 GiB") to bytes.
 * Unknown or missing units fall back to a scale of 1; thousands separators
 * (commas/apostrophes) are stripped before parsing. Returns undefined for
 * empty input.
 */
function parseSize(sizeText) {
  if (!sizeText) {
    return undefined;
  }
  const unitScales = [
    ['GiB', 1024 ** 3],
    ['MiB', 1024 ** 2],
    ['KiB', 1024],
    ['kB', 1024],
  ];
  const matched = unitScales.find(([unit]) => sizeText.includes(unit));
  const scale = matched ? matched[1] : 1;
  return Math.floor(parseFloat(sizeText.replace(/[',]/g, '')) * scale);
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,83 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const nyaasi = require('./nyaa_si_api');
const { Type } = require('../../lib/types');
const Promises = require('../../lib/promises');
const repository = require('../../lib/repository');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
const NAME = 'NyaaSi';
const UNTIL_PAGE = 10
const limiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Runs a full NyaaSi scrape and advances the provider's lastScraped
 * watermark to the scrape start time.
 */
async function scrape() {
  const startedAt = moment();
  const providerRecord = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  providerRecord.lastScraped = startedAt;
  await providerRecord.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Seeder refresh is intentionally disabled for this provider
// (a per-torrent re-fetch was too expensive); always report no updates.
async function updateSeeders(torrent) {
  return [];
}
// Only English-translated anime is scraped; per-category results flattened.
async function scrapeLatestTorrents() {
  const allowedCategories = [nyaasi.Categories.ANIME.ENGLISH];
  const tasks = allowedCategories
      .map(category => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(tasks);
  return [].concat(...entries);
}
/**
 * Recursively scrapes up to UNTIL_PAGE pages of the given category.
 * Fix: the category was previously dropped from the browse call
 * (`nyaasi.browse({ page, sort: 'id' })`), so every configured category
 * scraped the API's default listing; pass it through explicitly.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return nyaasi.browse({ category, page, sort: 'id' })
      .catch(error => {
        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)
          .catch(error => {
            console.warn(`Failed processing [${torrent.infoHash}] ${torrent.title} due: `, error);
            return Promise.resolve();
          })))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}
// Persists a browse listing as an anime entry; known records pass through.
async function processTorrentRecord(record) {
  if (!record || await checkAndUpdateTorrent(record)) {
    return record;
  }
  const torrent = {
    infoHash: record.infoHash,
    torrentLink: record.torrentLink,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.title,
    type: Type.ANIME,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
  };
  await createTorrentEntry(torrent);
  return torrent;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,150 @@
const needle = require("needle")
const cheerio = require("cheerio");
const decode = require('magnet-uri');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages } = require('../scraperHelper')
const defaultTimeout = 10000;
const maxSearchPage = 50
const baseUrl = 'https://ondebaixa.com';
// Site sections used to build browse URLs; DESENHOS (cartoons) is a
// separate section on the site but is mapped to SERIES downstream.
const Categories = {
  MOVIE: 'filmes',
  TV: 'series',
  ANIME: 'anime',
  DESENHOS: 'desenhos'
};
/**
 * Fetches a single OndeBaixa post (identified by the 4th URL path segment)
 * and returns one entry per magnet found, tagged with the post slug.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} query`));
  }
  const slug = encodeURIComponent(torrentId.split("/")[3]);
  return singleRequest(`${baseUrl}/${slug}/`, config)
      .then(parseTorrentPage)
      .then(entries => entries.map(entry => ({ torrentId: slug, ...entry })))
      .catch(error => {
        console.warn(`Failed OndeBaixo ${slug} request: `, error);
        return torrent(torrentId, config, retries - 1);
      });
}
/**
 * Keyword search, following pagination while the current page is full
 * (40 results) and extendToPage (capped at maxSearchPage) is not reached.
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const page = config.page || 1;
  const extendToPage = Math.min(maxSearchPage, config.extendToPage || 1);
  return singleRequest(`${baseUrl}/${keyword}/${page}/`, config)
      .then(parseTableBody)
      .then(torrents => {
        const hasMore = torrents.length === 40 && page < extendToPage;
        if (!hasMore) {
          return torrents;
        }
        return search(keyword, { ...config, page: page + 1 })
            .catch(() => [])
            .then(nextTorrents => torrents.concat(nextTorrents));
      })
      .catch(() => search(keyword, config, retries - 1));
}
// Lists one page of a category (or of the whole site when none is given).
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const categoryPath = config.category ? `${config.category}/` : '';
  const requestUrl = `${baseUrl}/${categoryPath}${page}/`;
  return singleRequest(requestUrl, config)
      .then(parseTableBody)
      .catch(() => browse(config, retries - 1));
}
// Performs one GET with a random user agent; empty or gateway/blocked
// bodies are rejected so callers can apply their retry logic.
function singleRequest(requestUrl, config = {}) {
  const checkBody = (body) => {
    if (!body) {
      throw new Error(`No body: ${requestUrl}`);
    }
    if (body.includes('502: Bad gateway') || body.includes('403 Forbidden')) {
      throw new Error(`Invalid body contents: ${requestUrl}`);
    }
    return body;
  };
  const options = {
    userAgent: getRandomUserAgent(),
    open_timeout: config.timeout || defaultTimeout,
    follow: 2
  };
  return needle('get', requestUrl, options).then(response => checkBody(response.body));
}
/**
 * Extracts {name, torrentId} pairs from a listing page.
 * Fix: return immediately after reject — previously execution fell through
 * and resolve() was still reached after rejecting.
 */
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      return reject(new Error('Failed loading body'));
    }
    const torrents = [];
    $('div.capa_larga.align-middle').each((i, element) => {
      const row = $(element);
      torrents.push({
        name: row.find("a").text(),
        torrentId: row.find("a").attr("href")
      });
    });
    resolve(torrents);
  });
}
/**
 * Parses an OndeBaixa post page into one entry per dubbed magnet link.
 * Metadata (original title, year, language) is read from labelled spans
 * inside div#informacoes — TODO(review): confirm layout still matches the
 * live site.
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    // Keep only magnets whose anchor title passes the PT-dub filter.
    const magnets = $(`a[href^="magnet"]`)
        .filter((i, elem) => isPtDubbed($(elem).attr('title')))
        .map((i, elem) => $(elem).attr("href")).get();
    const details = $('div#informacoes')
    const category = details.find('span:contains(\'Gêneros: \')').next().html()
    const torrents = magnets.map(magnetLink => {
      const decodedMagnet = decode(magnetLink);
      const name = escapeHTML(decodedMagnet.name || '').replace(/\+/g, ' ');
      const originalTitle = details.find('span:contains(\'Título Original: \')').next().text().trim();
      const year = details.find('span:contains(\'Ano de Lançamento: \')').next().text().trim();
      // Fallback title when the magnet has no display name of its own.
      const fallbackTitle = `${originalTitle} ${year}`;
      return {
        title: name ? sanitizePtName(name) : fallbackTitle,
        originalName: sanitizePtOriginalName(originalTitle),
        year: year,
        infoHash: decodedMagnet.infoHash,
        magnetLink: magnetLink,
        category: parseCategory(category),
        uploadDate: new Date($('time').attr('datetime')),
        languages: sanitizePtLanguages(details.find('span:contains(\'Idioma\')').next().text())
      }
    });
    resolve(torrents.filter((x) => x));
  });
}
// Maps the genre-links fragment to a site category; order matters, and an
// unmatched fragment yields undefined (kept from the original behavior).
function parseCategory(body) {
  const $ = cheerio.load(body);
  const matchers = [
    ["a[href*='anime']", Categories.ANIME],
    ["a[href*='series']", Categories.TV],
    ["a[href*='filmes']", Categories.MOVIE],
    ["a[href*='desenhos']", Categories.TV],
  ];
  for (const [selector, category] of matchers) {
    if ($(selector).text()) {
      return category;
    }
  }
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,119 @@
const moment = require("moment");
const Bottleneck = require("bottleneck");
const ondebaixa = require("./ondebaixa_api");
const { Type } = require("../../lib/types");
const repository = require("../../lib/repository");
const Promises = require("../../lib/promises");
const { createTorrentEntry, checkAndUpdateTorrent } = require("../../lib/torrentEntries");
const { updateCurrentSeeders, updateTorrentSize } = require("../../lib/torrent");
const { getImdbId } = require("../../lib/metadata");
const NAME = "OndeBaixa";
const UNTIL_PAGE = 5;
const TYPE_MAPPING = typeMapping();
const limiter = new Bottleneck({ maxConcurrent: 5 });
/**
 * Runs a full OndeBaixa scrape and advances the provider's lastScraped
 * watermark to the scrape start time.
 */
async function scrape() {
  const startedAt = moment();
  const providerRecord = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  providerRecord.lastScraped = startedAt;
  await providerRecord.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
// Re-fetches the post through the limiter to refresh seeder information.
async function updateSeeders(torrent) {
  const fetchAgain = () => ondebaixa.torrent(torrent.torrentId);
  return limiter.schedule(fetchAgain);
}
// Scrapes movies, series and cartoons sequentially; results flattened.
async function scrapeLatestTorrents() {
  const allowedCategories = [
    ondebaixa.Categories.MOVIE,
    ondebaixa.Categories.TV,
    ondebaixa.Categories.DESENHOS
  ];
  const tasks = allowedCategories
      .map(category => () => scrapeLatestTorrentsForCategory(category));
  const entries = await Promises.sequence(tasks);
  return [].concat(...entries);
}
// Walks category pages recursively until untilPage(category) or an empty page.
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  let listings;
  try {
    listings = await ondebaixa.browse({ category, page });
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
    listings = [];
  }
  const resolved = await Promise.all(
      listings.map(listing => limiter.schedule(() => processEntry(listing))));
  if (resolved.length > 0 && page < untilPage(category)) {
    return scrapeLatestTorrentsForCategory(category, page + 1);
  }
  return Promise.resolve();
}
// Expands a listing into its magnet records and processes them in order;
// any failure is deliberately swallowed (best-effort scraping).
async function processEntry(entry) {
  try {
    const records = await ondebaixa.torrent(entry.torrentId);
    return await Promises.sequence(records.map(record => () => processTorrentRecord(record)));
  } catch (ignored) {
    return undefined;
  }
}
/**
 * Enriches a parsed record (size, seeders, imdbId) in place, then persists it.
 * The enrichment order matters: size/seeders lookups mutate foundTorrent
 * before the entry object is built.
 */
async function processTorrentRecord(foundTorrent) {
  // Already stored — seeders were refreshed by checkAndUpdateTorrent.
  if (await checkAndUpdateTorrent({ provider: NAME, ...foundTorrent })) {
    return foundTorrent;
  }
  if (!foundTorrent.size) {
    await updateTorrentSize(foundTorrent);
  }
  if (!Number.isInteger(foundTorrent.seeders)) {
    await updateCurrentSeeders(foundTorrent);
  }
  // Anime is matched by other means; for movies/series resolve imdbId from
  // the original title + year when the page did not provide one.
  if (!foundTorrent.imdbId && TYPE_MAPPING[foundTorrent.category] !== Type.ANIME) {
    const info = { title: foundTorrent.originalName, year: foundTorrent.year };
    foundTorrent.imdbId = await getImdbId(info, TYPE_MAPPING[foundTorrent.category]).catch(() => undefined);
  }
  const torrent = {
    infoHash: foundTorrent.infoHash,
    provider: NAME,
    torrentId: foundTorrent.torrentId,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    imdbId: foundTorrent.imdbId,
    uploadDate: foundTorrent.uploadDate,
    seeders: foundTorrent.seeders,
    size: foundTorrent.size,
    files: foundTorrent.files,
    languages: foundTorrent.languages
  };
  return createTorrentEntry(torrent);
}
// Builds the OndeBaixa category -> internal type lookup once at module load;
// cartoons are treated as series.
function typeMapping() {
  return {
    [ondebaixa.Categories.MOVIE]: Type.MOVIE,
    [ondebaixa.Categories.TV]: Type.SERIES,
    [ondebaixa.Categories.DESENHOS]: Type.SERIES,
    [ondebaixa.Categories.ANIME]: Type.ANIME,
  };
}
// Per-category page depth; the per-category overrides currently match the
// default but are kept as separate knobs for future tuning.
function untilPage(category) {
  const overrides = {
    [ondebaixa.Categories.DESENHOS]: 5,
    [ondebaixa.Categories.TV]: 5,
  };
  return overrides[category] || UNTIL_PAGE;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,147 @@
const needle = require('needle');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const baseUrl = 'https://torrentapi.org/pubapi_v2.php';
const appId = 'torrentio-addon';
const defaultTimeout = 30000;
let token;
// TorrentApi request options: numeric category ids (arrays, joined with ';'
// when sent), plus the accepted sort/format/ranked values.
const Options = {
  category: {
    MOVIES_XVID: [14],
    MOVIES_XVID_720P: [48],
    MOVIES_X264: [17],
    MOVIES_X264_1080P: [44],
    MOVIES_X264_720P: [45],
    MOVIES_X264_3D: [47],
    MOVIES_X264_4K: [50],
    MOVIES_X265_1080P: [54],
    MOVIES_X265_4K: [51],
    MOVIES_X265_4K_HDR: [52],
    MOVIES_FULL_BD: [42],
    MOVIES_BD_REMUX: [46],
    TV_EPISODES: [18],
    TV_UHD_EPISODES: [49],
    TV_HD_EPISODES: [41],
    MUSIC_MP3: [23],
    MUSIC_FLAC: [25],
    GAMES_PC_ISO: [27],
    GAMES_PC_RIP: [28],
    GAMES_PS3: [40],
    GAMES_XBOX_360: [32],
    SOFTWARE_PC_ISO: [33],
    EBOOKS: [35],
    XXX: [4],
  },
  sort: {
    LAST: 'last',
    SEEDERS: 'seeders',
    LEECHERS: 'leechers'
  },
  format: {
    JSON: 'json',
    // json_extended includes episode_info (imdb id) and pubdate
    JSON_EXTENDED: 'json_extended'
  },
  ranked: {
    TRUE: 1,
    FALSE: 0
  }
}
/**
 * Searches TorrentApi by imdb id. Defaults: 100 results, sorted by seeders,
 * extended JSON format, unranked entries included.
 */
function search(imdbId, params = {}) {
  if (!imdbId) {
    return Promise.reject(new Error(`Must define imdbId`));
  }
  const parameters = {
    mode: 'search',
    search_imdb: imdbId,
    category: params.category && params.category.join(';') || null,
    limit: params.limit || 100,
    sort: params.sort || Options.sort.SEEDERS,
    min_seeders: params.min_seeders || undefined,
    min_leechers: params.min_leechers || undefined,
    format: params.format || Options.format.JSON_EXTENDED,
    ranked: params.ranked || Options.ranked.FALSE
  };
  return singleRequest(parameters).then(parseResults);
}
/**
 * Lists the latest TorrentApi entries. Defaults: 100 results, sorted by
 * upload time, extended JSON format, unranked entries included.
 */
function browse(params = {}) {
  const parameters = {
    mode: 'list',
    category: params.category && params.category.join(';') || null,
    limit: params.limit || 100,
    sort: params.sort || Options.sort.LAST,
    min_seeders: params.min_seeders || undefined,
    min_leechers: params.min_leechers || undefined,
    format: params.format || Options.format.JSON_EXTENDED,
    ranked: params.ranked || Options.ranked.FALSE
  };
  return singleRequest(parameters).then(parseResults);
}
/**
 * Performs one TorrentApi call, injecting the app id and a cached token.
 * Retries transparently on an expired token (error_code 4) and on rate
 * limiting / empty responses (error_code 5/20), up to `retries` attempts.
 * NOTE: mutates `params` (token/app_id added, null/undefined keys removed).
 * Fixes: the token-expiry retry previously reset retries to the default,
 * allowing infinite recursion on a permanently rejected token; the failure
 * message also had a "RARGB" typo.
 */
async function singleRequest(params = {}, config = {}, retries = 10) {
  const timeout = config.timeout || defaultTimeout;
  const options = { open_timeout: timeout, follow: 2 };
  params.token = await getToken();
  params.app_id = appId;
  Object.keys(params)
      .filter(key => params[key] === undefined || params[key] === null)
      .forEach(key => delete params[key]);
  return needle('get', baseUrl, params, options)
      .then(response => {
        if (response.body && response.body.error_code === 4 && retries > 0) {
          // token expired — drop the cache and retry with the budget decremented
          token = undefined;
          return singleRequest(params, config, retries - 1);
        }
        if ((!response.body || !response.body.length || [5, 20].includes(response.body.error_code)) && retries > 0) {
          // too many requests — back off before retrying
          return Promises.delay(3000).then(() => singleRequest(params, config, retries - 1));
        }
        if (response.statusCode !== 200 || (response.body && response.body.error)) {
          // something went wrong
          return Promise.reject(response.body || `Failed RARBG request with status=${response.statusCode}`);
        }
        return response.body;
      });
}
/**
 * Maps a TorrentApi response to parsed results.
 * Fix: reject with a real Error instead of a bare string so stack traces
 * and `instanceof Error` checks work downstream.
 * Note: returns a plain array on success and a Promise on failure — callers
 * always consume it inside a .then chain, where both are handled.
 */
function parseResults(results) {
  if (!results || !Array.isArray(results.torrent_results)) {
    return Promise.reject(new Error(`Incorrect results ${JSON.stringify(results)}`));
  }
  return results.torrent_results.map(result => parseResult(result));
}
// Normalises one extended-format TorrentApi entry into the internal shape;
// the info hash is decoded from the magnet link.
function parseResult(result) {
  const episodeInfo = result.episode_info;
  return {
    title: result.title,
    infoHash: decode(result.download).infoHash,
    magnetLink: result.download,
    seeders: result.seeders,
    leechers: result.leechers,
    category: result.category,
    size: result.size,
    uploadDate: new Date(result.pubdate),
    imdbId: episodeInfo && episodeInfo.imdb
  };
}
/**
 * Lazily fetches the API token and memoises it in the module-level `token`
 * (cleared by singleRequest when the API reports it expired).
 * NOTE(review): concurrent callers before the first token resolves will each
 * request their own token — presumably harmless, but confirm rate limits.
 */
async function getToken() {
  if (!token) {
    const options = { open_timeout: defaultTimeout };
    token = await needle('get', baseUrl, { get_token: 'get_token', app_id: appId }, options)
        .then(response => response.body.token);
  }
  return token;
}
module.exports = { search, browse, Options };

View File

@@ -0,0 +1,84 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const rarbg = require('./rarbg_api');
const { Type } = require('../../lib/types');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'RARBG';
// One API call at a time, at most every 3s (RARBG rate limit).
const limiter = new Bottleneck({ maxConcurrent: 1, minTime: 3000 });
// Parallelism for db entry processing.
const entryLimiter = new Bottleneck({ maxConcurrent: 20 });
// Movie and tv episode categories eligible for the dump scrape; flattened in
// case any option value is itself a list of category ids.
const allowedCategories = [
  rarbg.Options.category.MOVIES_XVID,
  rarbg.Options.category.MOVIES_XVID_720P,
  rarbg.Options.category.MOVIES_X265_1080P,
  rarbg.Options.category.MOVIES_X265_4K,
  rarbg.Options.category.MOVIES_X265_4K_HDR,
  rarbg.Options.category.MOVIES_X264,
  rarbg.Options.category.MOVIES_X264_720P,
  rarbg.Options.category.MOVIES_X264_1080P,
  rarbg.Options.category.MOVIES_X264_3D,
  rarbg.Options.category.MOVIES_X264_4K,
  rarbg.Options.category.MOVIES_BD_REMUX,
  rarbg.Options.category.TV_EPISODES,
  rarbg.Options.category.TV_UHD_EPISODES,
  rarbg.Options.category.TV_HD_EPISODES
].reduce((a, b) => a.concat(b), [])
/**
 * One-off dump scrape: loads a static list of imdb ids from a bundled json
 * file and fetches RARBG torrents for each id (rate limited), storing entries.
 * NOTE(review): the movie id list is currently disabled; only the series dump
 * is processed.
 */
async function scrape() {
  console.log(`[${moment()}] starting ${NAME} dump scrape...`);
  // const movieImdbIds = require('./rargb_movie_imdb_ids_2021-02-27.json');
  const seriesImdbIds = require('./rargb_series_imdb_ids_2021-02-27.json');
  //const allImdbIds = [].concat(movieImdbIds).concat(seriesImdbIds);
  return Promise.all(
      seriesImdbIds.map(imdbId => limiter.schedule(() => getTorrentsForImdbId(imdbId))
          .then(torrents => Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t)))))))
      .then(() => console.log(`[${moment()}] finished ${NAME} dump scrape`));
}
/**
 * Searches RARBG for all torrents of the given imdb id within the allowed
 * categories. Failures are logged and swallowed (yielding an empty list) so a
 * single bad id does not abort the whole dump scrape.
 */
async function getTorrentsForImdbId(imdbId) {
  try {
    const torrents = await rarbg.search(imdbId, { category: allowedCategories });
    console.log(`Completed ${imdbId} request`);
    return torrents;
  } catch (error) {
    console.warn(`Failed ${NAME} request for ${imdbId}: `, error);
    return [];
  }
}
/**
 * Stores a parsed RARBG record unless an entry for it already exists, in
 * which case checkAndUpdateTorrent refreshes it and the record is returned
 * as-is.
 */
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    title: record.title,
    type: getType(record.category),
    seeders: record.seeders,
    size: record.size,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId
  };
  return createTorrentEntry(torrent);
}
// RARBG category labels that denote series content; anything else maps to movie.
const seriesCategories = [
  'TV Episodes',
  'Movies/TV-UHD-episodes',
  'TV HD Episodes',
];

/** Resolves the internal content type for a RARBG category label. */
function getType(category) {
  return seriesCategories.includes(category) ? Type.SERIES : Type.MOVIE;
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,99 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const rarbg = require('./rarbg_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'RARBG';
// One API call at a time, at most every 2.5s (RARBG rate limit).
const limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
// Parallelism for db entry processing.
const entryLimiter = new Bottleneck({ maxConcurrent: 10 });
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
return scrapeLatestTorrents()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return lastScrape.save();
})
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
/**
 * Seeder refresh is disabled for RARBG; always resolves to an empty list.
 * (A per-imdb-id implementation existed here but was intentionally turned off.)
 */
async function updateSeeders(torrent, getImdbIdsMethod) {
  return [];
}
/**
 * Browses every allowed RARBG category sequentially (rate limited) and
 * flattens the per-category results into a single list.
 */
async function scrapeLatestTorrents() {
  // movie and tv episode categories eligible for scraping
  const allowedCategories = [
    rarbg.Options.category.MOVIES_XVID,
    rarbg.Options.category.MOVIES_XVID_720P,
    rarbg.Options.category.MOVIES_X265_1080P,
    rarbg.Options.category.MOVIES_X265_4K,
    rarbg.Options.category.MOVIES_X265_4K_HDR,
    rarbg.Options.category.MOVIES_X264,
    rarbg.Options.category.MOVIES_X264_720P,
    rarbg.Options.category.MOVIES_X264_1080P,
    rarbg.Options.category.MOVIES_X264_3D,
    rarbg.Options.category.MOVIES_X264_4K,
    rarbg.Options.category.MOVIES_BD_REMUX,
    rarbg.Options.category.TV_EPISODES,
    rarbg.Options.category.TV_UHD_EPISODES,
    rarbg.Options.category.TV_HD_EPISODES
  ];
  return Promises.sequence(allowedCategories
      .map(category => () => limiter.schedule(() => scrapeLatestTorrentsForCategory(category))))
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}
/**
 * Browses the latest RARBG torrents for a single category and processes each
 * entry. Errors are logged and swallowed so the other categories still run.
 */
async function scrapeLatestTorrentsForCategory(category) {
  console.log(`Scrapping ${NAME} ${category} category`);
  try {
    const torrents = await rarbg.browse({ category: category });
    return await Promise.all(torrents.map(t => entryLimiter.schedule(() => processTorrentRecord(t))));
  } catch (error) {
    console.warn(`Failed ${NAME} scrapping for ${category} due: `, error);
    return [];
  }
}
/**
 * Stores a parsed RARBG record unless an entry for it already exists, in
 * which case checkAndUpdateTorrent refreshes it and the record is returned
 * as-is.
 */
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    title: record.title,
    type: getType(record.category),
    seeders: record.seeders,
    size: record.size,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId
  };
  return createTorrentEntry(torrent);
}
// RARBG category labels that denote series content; anything else maps to movie.
const seriesCategories = [
  'TV Episodes',
  'Movies/TV-UHD-episodes',
  'TV HD Episodes',
];

/** Resolves the internal content type for a RARBG category label. */
function getType(category) {
  return seriesCategories.includes(category) ? Type.SERIES : Type.MOVIE;
}
module.exports = { scrape, updateSeeders, NAME };

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,206 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const { defaultOptionsWithProxy } = require('../../lib/requestHelper');
// Base endpoint of the rutor mirror used for all requests.
const baseUrl = 'http://www.rutor.info';
// Socket open timeout for every request, in milliseconds.
const defaultTimeout = 10000;

// Numeric rutor category ids as used in browse urls.
const Categories = {
  FOREIGN_FILMS: '1',
  RUSSIAN_FILMS: '5',
  SCIENCE_FILMS: '12',
  FOREIGN_SERIES: '4',
  RUSSIAN_SERIES: '16',
  RUSSIAN_TV: '6',
  RUSSIAN_ANIMATION: '7',
  ANIME: '10',
  FOREIGN_RELEASES: '17'
};
/**
 * Loads and parses a single rutor torrent page, retrying up to `retries`
 * times before rejecting with the last seen error.
 */
function torrent(torrentId, config = {}, retries = 2, error = null) {
  if (!torrentId || retries === 0) {
    return Promise.reject(error || new Error(`Failed ${torrentId} search`));
  }
  const pageUrl = `${baseUrl}/torrent/${torrentId}`;
  return singleRequest(pageUrl)
      .then(body => parseTorrentPage(body, torrentId))
      .catch(err => torrent(torrentId, config, retries - 1, err));
}
/**
 * Searches rutor across all categories for the given query, retrying up to
 * `retries` times before rejecting with the last seen error.
 */
function search(query, retries = 2, error = null) {
  if (retries === 0) {
    return Promise.reject(error || new Error(`Failed browse request`));
  }
  const searchUrl = `${baseUrl}/search/0/0/0/0/${encodeURIComponent(query)}`;
  return singleRequest(searchUrl)
      .then(body => parseTableBody(body))
      .catch(err => search(query, retries - 1, err));
}
/**
 * Lists a rutor category page (1-based `config.page`; rutor urls are
 * 0-based), retrying up to `retries` times before rejecting.
 */
function browse(config = {}, retries = 2, error = null) {
  if (retries === 0) {
    return Promise.reject(error || new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const browseUrl = `${baseUrl}/browse/${page - 1}/${config.category}/0/0`;
  return singleRequest(browseUrl)
      .then(body => parseTableBody(body))
      .catch(err => browse(config, retries - 1, err));
}
/** Retrieves and parses the file listing of a rutor torrent. */
function files(torrentId) {
  const listingUrl = `${baseUrl}/descriptions/${torrentId}.files`;
  return singleRequest(listingUrl).then(body => parseFiles(body));
}
/**
 * Performs a proxied GET against rutor and resolves the raw html body.
 * Rejects when the body is missing or contains known block/error pages so
 * the retrying callers can attempt again.
 */
function singleRequest(requestUrl) {
  const options = { ...defaultOptionsWithProxy(), open_timeout: defaultTimeout, follow: 2 };
  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
        } else if (body.includes('Access Denied')) {
          console.log(`Access Denied: ${requestUrl}`);
          throw new Error(`Access Denied: ${requestUrl}`);
        } else if (body.includes('502: Bad gateway') ||
            body.includes('403 Forbidden') ||
            body.includes('Origin DNS error')) {
          // proxy/CDN error pages come back with a 200-ish body - treat as failure
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}
/**
 * Parses a rutor listing/search results table into torrent summaries.
 * @param {string} body html of a browse or search results page
 * @returns {Promise<Array<Object>>} parsed rows from the #index table
 */
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      // bail out instead of falling through into the parsing below
      return reject(new Error('Failed loading body'));
    }
    const torrents = $('#index').find('tr:not(.backgr)').map((i, elem) => {
      const row = $(elem).find('td');
      const links = $(row[1]).find('a');
      const peers = $(row[row.length - 1]);
      const magnetLink = $(links[1]).attr('href');
      return {
        title: $(links[2]).text(),
        infoHash: decode(magnetLink).infoHash,
        magnetLink: magnetLink,
        torrentLink: $(links[0]).attr('href'),
        torrentId: $(links[2]).attr('href').match(/torrent\/(\d+)/)[1],
        seeders: parseInt(peers.find('.green').text(), 10),
        leechers: parseInt(peers.find('.red').text(), 10),
        uploadDate: parseRussianDate($(row[0]).text()),
        size: $(row[row.length - 2]).html().replace('&#xA0;', ' '),
      };
    }).get();
    resolve(torrents);
  });
}
/**
 * Parses a rutor torrent details page into the internal torrent shape.
 * The magnet link is decoded once to extract both the info hash and the
 * tracker list; the imdb id is scraped from the description html.
 * @param {string} body html of a /torrent/{id} page
 * @param {string|number} torrentId id the page was fetched for
 * @returns {Promise<Object>} the parsed torrent
 */
function parseTorrentPage(body, torrentId) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    if (!$) {
      // bail out instead of falling through into the parsing below
      return reject(new Error('Failed loading body'));
    }
    const rows = $('#details > tr');
    const details = $(rows[0]).find('td:nth-of-type(2)');
    const magnetLink = $('#download a:nth-of-type(1)').attr('href');
    const magnet = decode(magnetLink); // decode once instead of twice
    const imdbIdMatch = details.html().match(/imdb\.com\/title\/(tt\d+)/i);
    const parsedTorrent = {
      title: $('#all h1').first().text(),
      torrentId: torrentId,
      infoHash: magnet.infoHash,
      trackers: Array.from(new Set(magnet.tr)).join(','),
      magnetLink: magnetLink,
      torrentLink: $('#download a:nth-of-type(2)').attr('href'),
      seeders: parseInt($(rows[rows.length - 8]).find('td:nth-of-type(2)').first().text(), 10),
      category: $('tr:contains(\'Категория\') a').first().attr('href').match(/\/([\w-]+)$/)[1],
      languages: parseLanguages(details.text()),
      size: parseSize($(rows[rows.length - 4]).find('td:nth-of-type(2)').text()),
      uploadDate: parseDate($(rows[rows.length - 5]).find('td:nth-of-type(2)').first().text()),
      imdbId: imdbIdMatch && imdbIdMatch[1]
    };
    resolve(parsedTorrent);
  });
}
/**
 * Parses the plain html file listing returned by /descriptions/{id}.files.
 * NOTE(review): `name` and `path` apply effectively the same replacement
 * (both strip everything up to the last '/'), so path currently equals
 * name - presumably path was meant to keep the directory part; confirm
 * against callers before changing.
 * @throws {Error} when the body is empty
 */
function parseFiles(body) {
  if (!body) {
    throw new Error("No files in the body");
  }
  return body.split('\n')
      .map((item) => item.match(/<td>([^<]+)<\/td>/g).slice(1))
      .map((item, index) => ({
        fileIndex: index,
        name: item[0].replace(/^.+\//g, ''),
        path: item[0].replace(/^.+\//, ''),
        size: parseSize(item[1])
      }));
}
/** Parses an absolute rutor date like '01-02-2020 10:30:45 (...)' into a Date. */
function parseDate(dateString) {
  const cleaned = dateString.replace(/\s\(.*\)/, '');
  return moment(cleaned, 'DD-MM-YYYY HH:mm:ss').toDate();
}
// Russian month abbreviations mapped to their english counterparts, so the
// listing dates can be fed through moment's default (english) parser.
const russianMonths = {
  'Янв': 'Jan',
  'Фев': 'Feb',
  'Мар': 'Mar',
  'Апр': 'Apr',
  'Май': 'May',
  'Июн': 'Jun',
  'Июл': 'Jul',
  'Авг': 'Aug',
  'Сен': 'Sep',
  'Окт': 'Oct',
  'Ноя': 'Nov',
  'Дек': 'Dec'
};

/** Parses a listing date such as '01 Янв 21' (possibly with nbsp) into a Date. */
function parseRussianDate(dateString) {
  const month = Object.keys(russianMonths).find(m => dateString.includes(m));
  const normalized = dateString.trim()
      .replace(month, russianMonths[month])
      .replace(/\u00a0/g, ' ');
  return moment(normalized, 'DD MMM YY').toDate();
}
/** Extracts the exact byte count from a label like '1.37 GB (1468006400 Bytes)'. */
function parseSize(sizeString) {
  const [, bytes] = sizeString.match(/\((\d+) Bytes\)/);
  return parseInt(bytes, 10);
}
// Audio language detectors keyed by language name. Each pattern requires an
// audio/language marker before the language token, to avoid matching
// subtitle listings or unrelated words.
const languageMatchers = {
  'russian': /(?:Язык|Звук|Аудио|audio|language).*(russian|\brus?\b|[Рр]усский)/i,
  'english': /(?:Язык|Звук|Аудио|audio|language).*(english|\beng?\b|[Аа]нглийский)/i,
  'ukrainian': /(?:Язык|Звук|Аудио|audio|language).*(ukrainian|\bukr\b|украинский)/i,
  'french': /(?:Язык|Звук|Аудио|audio|language).*(french|\bfr\b|французский)/i,
  'spanish': /(?:Язык|Звук|Аудио|audio|language).*(spanish|\bspa\b|испанский)/i,
  'italian': /(?:Язык|Звук|Аудио|audio|language).*(italian|\bita\b|итальянский)/i,
  'german': /(?:Язык|Звук|Аудио|audio|language).*(german|\bger\b|Немецкий)/i,
  'korean': /(?:Язык|Звук|Аудио|audio|language).*(korean|Корейский)/i,
  'arabic': /(?:Язык|Звук|Аудио|audio|language).*(arabic|Арабский)/i,
  'portuguese': /(?:Язык|Звук|Аудио|audio|language).*(portuguese|Португальский)/i,
  'japanese': /(?:Язык|Звук|Аудио|audio|language).*(japanese|\bjap\b|\bjp\b|[Яя]понский)/i,
}

/**
 * Detects audio languages in a torrent description. Anything after the
 * subtitles section is ignored; russian is always implied. More than four
 * detected languages collapse to 'multi-audio'.
 */
function parseLanguages(details) {
  const subsMarker = details.match(/\r?\n(Text|Текст)(?:\s?#?\d{1,2})?\r?\n/i);
  const audioSection = subsMarker ? details.substring(0, subsMarker.index) : details;
  const detected = new Set(['russian']);
  for (const [language, matcher] of Object.entries(languageMatchers)) {
    if (matcher.test(audioSection)) {
      detected.add(language);
    }
  }
  return detected.size > 4 ? 'multi-audio' : [...detected].join(',');
}
module.exports = { torrent, browse, search, Categories };

View File

@@ -0,0 +1,133 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const { parse } = require('parse-torrent-title');
const rutor = require('./rutor_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'Rutor';
// Maps rutor url category slugs to internal content types; categories not
// present here are rejected during processing.
const TYPE_MAPPING = {
  'kino': Type.MOVIE,
  'nashe_kino': Type.MOVIE,
  'nauchno_popularnoe': Type.MOVIE,
  'inostrannoe': Type.MOVIE,
  'seriali': Type.SERIES,
  'nashi_seriali': Type.SERIES,
  'tv': Type.SERIES,
  'multiki': Type.MOVIE,
  'anime': Type.ANIME
};
// Rate limiters: listing pages (1 req / 5s), per-torrent detail pages
// (1 req / 2.5s), and concurrent record processing.
const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
const api_entry_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 2500 });
const limiter = new Bottleneck({ maxConcurrent: 10 });
// Categories scraped by this provider; RUSSIAN_TV is not included.
const allowedCategories = [
  rutor.Categories.FOREIGN_FILMS,
  rutor.Categories.FOREIGN_RELEASES,
  rutor.Categories.RUSSIAN_FILMS,
  rutor.Categories.FOREIGN_SERIES,
  rutor.Categories.RUSSIAN_SERIES,
  rutor.Categories.SCIENCE_FILMS,
  rutor.Categories.RUSSIAN_ANIMATION,
  rutor.Categories.ANIME
];
/**
 * Runs a full rutor scrape: walks the allowed categories, stores new
 * torrents, then persists the scrape timestamp on the provider record.
 * (Dead commented-out single-id debug code removed.)
 */
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
/** Seeder refresh is not supported for rutor; always resolves empty. */
async function updateSeeders(torrent) {
  return [];
}
/** Scrapes each allowed category sequentially and flattens the results. */
async function scrapeLatestTorrents() {
  const tasks = allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category));
  return Promises.sequence(tasks)
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}
/**
 * Browses one rutor category page (rate limited), processes every row, and
 * recurses into the next page while results keep coming and the per-category
 * page cap is not reached. Browse failures are logged and end the category.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return api_limiter.schedule(() => rutor.browse({ category, page }))
      .catch(error => {
        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < getMaxPage(category)
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}
/**
 * Converts a listing row into a stored torrent entry:
 * skips rows already known (after a seeder refresh), skips old unseeded
 * torrents, fetches the full details page (rate limited), and rejects
 * records with unsupported categories or missing imdb ids where required.
 */
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  // torrents older than 18 months with no seeders are not worth a detail fetch
  const isOld = moment(record.uploadDate).isBefore(moment().subtract(18, 'month'));
  if (record.seeders === 0 && isOld) {
    console.log(`Skipping old unseeded torrent [${record.infoHash}] ${record.title}`)
    return record;
  }
  const foundTorrent = await api_entry_limiter.schedule(() => rutor.torrent(record.torrentId).catch(() => undefined));
  if (!foundTorrent || !TYPE_MAPPING[foundTorrent.category]) {
    return Promise.resolve(`${NAME}: Invalid torrent record: ${record.torrentId}`);
  }
  if (!foundTorrent.imdbId && disallowWithoutImdbId(foundTorrent)) {
    return Promise.resolve(`${NAME}: No imdbId defined: ${record.torrentId}`);
  }
  const torrent = {
    provider: NAME,
    infoHash: foundTorrent.infoHash,
    torrentId: foundTorrent.torrentId,
    torrentLink: foundTorrent.torrentLink,
    trackers: foundTorrent.trackers,
    title: foundTorrent.title,
    type: TYPE_MAPPING[foundTorrent.category],
    size: foundTorrent.size,
    seeders: foundTorrent.seeders,
    uploadDate: foundTorrent.uploadDate,
    imdbId: foundTorrent.imdbId,
    languages: foundTorrent.languages || undefined,
  };
  return createTorrentEntry(torrent).then(() => torrent);
}
/**
 * Page depth per category: foreign films and series get two pages, every
 * other category only the first one.
 */
function getMaxPage(category) {
  const deepCategories = [rutor.Categories.FOREIGN_FILMS, rutor.Categories.FOREIGN_SERIES];
  return deepCategories.includes(category) ? 2 : 1;
}
/**
 * Decides whether a torrent without an imdb id should be rejected.
 * Foreign movies ('kino') and anime are allowed through, as are series
 * whose parsed title contains no cyrillic characters - those ids can be
 * resolved later via title search.
 */
function disallowWithoutImdbId(torrent) {
  if (['kino', 'anime'].includes(torrent.category)) {
    return false; // allow to search foreign movie and anime ids via search
  }
  // allow to search id for non russian series titles via search
  return !(torrent.category === 'seriali' && !parse(torrent.title).title.match(/[\u0400-\u04ff]/i));
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,26 @@
/** Truthy (match result) when the name hints at a PT-dubbed or multi-audio release. */
function isPtDubbed(name) {
  return /dublado|dual|nacional|multi/.exec(name.toLowerCase());
}
/**
 * Cleans up a PT release name: moves a resolution token to the end when it
 * is not already last, then strips site watermarks, metadata prefixes and
 * dangling site endings.
 */
function sanitizePtName(name) {
  const watermark = /^[\[{]?(?:ACESSE.*|WWW\.)?[A-Z]+\.(COM|NET|ORG|TO|TV|ME)\b\s*[-\]}]+[\s.]*/i;
  let sanitized = name.replace(/(.*)\b(\d{3,4}P)\b(?!.*\d{3,4}[Pp])(.*)/, '$1$3 $2'); // resolution to the end if missing
  sanitized = sanitized.replace(watermark, ''); // strip watermarks
  sanitized = sanitized.replace(/^(\d*(?:\.\d{1,2})?(?:[4A-Z-]{3,}|P)[-.]+)+/, ''); // strip metadata prefixes
  sanitized = sanitized.replace(watermark, ''); // strip watermarks uncovered by the prefix removal
  sanitized = sanitized.replace(/^(COM|NET|ORG|TO|TV|ME)\b\s*-+[\s.]*/, ''); // strip dangling site endings
  return sanitized.trim();
}
/** Drops a trailing season marker (e.g. 'S2', '1a temporada') from a PT original title. */
function sanitizePtOriginalName(name) {
  const trimmed = name.trim();
  return trimmed.replace(/S\d+$|\d.\s?[Tt]emporada/, '');
}
/**
 * Normalizes a scraped PT language string.
 * NOTE(review): the first pattern appears in this source as mis-encoded
 * bytes (rendered '<2F><>'); it presumably targeted a mangled 'ê' byte
 * sequence in the scraped html - confirm the file's original encoding
 * before editing either pattern (the second may be a non-breaking space).
 */
function sanitizePtLanguages(languages) {
  return languages
      .replace(/<2F><>/g, 'ê')
      .replace(/ /g, '')
      .trim();
}
module.exports = { isPtDubbed, sanitizePtName, sanitizePtOriginalName, sanitizePtLanguages }

View File

@@ -0,0 +1,129 @@
const needle = require('needle');
const { escapeHTML } = require('../../lib/metadata');
// Root of the apibay (The Pirate Bay json api) service.
const baseUrl = 'https://apibay.org';
// Socket open timeout for every request, in milliseconds.
const timeout = 5000;

// Numeric tpb category ids grouped by top-level category; `ALL` selects the
// whole group, the x99 ids are each group's 'other' bucket.
const Categories = {
  AUDIO: {
    ALL: 100,
    MUSIC: 101,
    AUDIO_BOOKS: 102,
    SOUND_CLIPS: 103,
    FLAC: 104,
    OTHER: 199
  },
  VIDEO: {
    ALL: 200,
    MOVIES: 201,
    MOVIES_DVDR: 202,
    MUSIC_VIDEOS: 203,
    MOVIE_CLIPS: 204,
    TV_SHOWS: 205,
    HANDHELD: 206,
    MOVIES_HD: 207,
    TV_SHOWS_HD: 208,
    MOVIES_3D: 209,
    OTHER: 299
  },
  APPS: {
    ALL: 300,
    WINDOWS: 301,
    MAC: 302,
    UNIX: 303,
    HANDHELD: 304,
    IOS: 305,
    ANDROID: 306,
    OTHER_OS: 399
  },
  GAMES: {
    ALL: 400,
    PC: 401,
    MAC: 402,
    PSx: 403,
    XBOX360: 404,
    Wii: 405,
    HANDHELD: 406,
    IOS: 407,
    ANDROID: 408,
    OTHER: 499
  },
  PORN: {
    ALL: 500,
    MOVIES: 501,
    MOVIES_DVDR: 502,
    PICTURES: 503,
    GAMES: 504,
    MOVIES_HD: 505,
    MOVIE_CLIPS: 506,
    OTHER: 599
  },
  OTHER: {
    ALL: 600,
    E_BOOKS: 601,
    COMICS: 602,
    PICTURES: 603,
    COVERS: 604,
    PHYSIBLES: 605,
    OTHER: 699
  }
};
/** Fetches a single torrent by id from apibay, retrying on failure. */
function torrent(torrentId, retries = 2) {
  if (!torrentId) {
    return Promise.reject(new Error('No valid torrentId provided'));
  }
  return _request(`t.php?id=${torrentId}`)
      .then(result => toTorrent(result))
      .catch(error => (retries ? torrent(torrentId, retries - 1) : Promise.reject(error)));
}
/**
 * Searches apibay for a keyword within an optional category.
 * The keyword is url-encoded so spaces and reserved characters survive the
 * query string (previously it was interpolated raw).
 * @param {string} keyword search phrase; required
 * @param {Object} config optional; supports `category` (defaults to all video)
 * @param {number} retries attempts left on failure
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword) {
    return Promise.reject(new Error('No valid keyword provided'));
  }
  const q = encodeURIComponent(keyword);
  const cat = config.category || Categories.VIDEO.ALL;
  return _request(`q.php?q=${q}&cat=${cat}`)
      .then(results => results.map((result) => toTorrent(result)))
      .catch(error => retries ? search(keyword, config, retries - 1) : Promise.reject(error));
}
/** Lists torrents for a category page (1-based `config.page`, defaults to the first). */
function browse(config = {}, retries = 2) {
  const category = config.category || 0;
  const page = config.page - 1 || 0;
  const endpoint = `q.php?q=category:${category}:${page}`;
  return _request(endpoint)
      .then(results => results.map(result => toTorrent(result)))
      .catch(error => (retries ? browse(config, retries - 1) : Promise.reject(error)));
}
/**
 * Performs a GET against apibay and resolves the parsed json body.
 * needle auto-parses json responses into objects; any non-object payload is
 * treated as an error. Rejects with an Error (previously a bare string) that
 * carries the http status for diagnostics.
 * @param {string} endpoint path + query string relative to the api root
 */
async function _request(endpoint) {
  const url = `${baseUrl}/${endpoint}`;
  return needle('get', url, { open_timeout: timeout })
      .then(response => {
        if (typeof response.body === 'object') {
          return response.body;
        }
        return Promise.reject(new Error(`Unexpected response body with status=${response.statusCode}`));
      });
}
/**
 * Maps a raw apibay record into the internal torrent shape. apibay returns
 * numbers as strings and `added` as unix seconds; parseInt now always gets
 * an explicit decimal radix.
 */
function toTorrent(result) {
  return {
    torrentId: result.id,
    name: escapeHTML(result.name),
    infoHash: result.info_hash.toLowerCase(),
    size: parseInt(result.size, 10),
    seeders: parseInt(result.seeders, 10),
    leechers: parseInt(result.leechers, 10),
    subcategory: parseInt(result.category, 10),
    uploadDate: new Date(result.added * 1000),
    imdbId: result.imdb || undefined,
    filesCount: result.num_files && parseInt(result.num_files, 10) || undefined
  };
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,175 @@
const moment = require('moment');
const needle = require('needle');
const Bottleneck = require('bottleneck');
const { ungzip } = require('node-gzip');
const LineByLineReader = require('line-by-line');
const fs = require('fs');
const thepiratebay = require('./thepiratebay_api.js');
const bing = require('nodejs-bing');
const { Type } = require('../../lib/types');
const { escapeHTML } = require('../../lib/metadata');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'ThePirateBay';
// Location of the downloaded and unzipped csv dump.
const CSV_FILE_PATH = '/tmp/tpb_dump.csv';
// Parallelism for per-row processing while streaming the dump.
const limiter = new Bottleneck({ maxConcurrent: 40 });
/**
 * Streams the tpb csv dump line by line and processes each row into a
 * torrent entry. Reading is paused whenever the limiter queue is full and
 * resumed once it drains, to bound memory usage.
 * NOTE(review): lastDump/checkPoint are hardcoded here (the dump discovery
 * call is commented out), so this is effectively a manual one-off scrape.
 */
async function scrape() {
  const lastDump = { updatedAt: 2147000000 };
  //const checkPoint = moment('2016-06-17 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
  //const lastDump = await thepiratebay.dumps().then((dumps) => dumps.sort((a, b) => b.updatedAt - a.updatedAt)[0]);
  const checkPoint = 0;
  if (lastDump) {
    console.log(`starting to scrape tpb dump: ${JSON.stringify(lastDump)}`);
    await downloadDump(lastDump);
    let entriesProcessed = 0;
    const lr = new LineByLineReader(CSV_FILE_PATH);
    lr.on('line', (line) => {
      // header line of the dump
      if (line.includes("#ADDED")) {
        return;
      }
      if (entriesProcessed % 1000 === 0) {
        console.log(`Processed ${entriesProcessed} entries`);
      }
      // skip rows already handled in a previous (interrupted) run
      if (entriesProcessed <= checkPoint) {
        entriesProcessed++;
        return;
      }
      // columns: added date; base64 info hash; quoted title; size
      const row = line.match(/(?<=^|;)(".*"|[^;]+)(?=;|$)/g);
      if (row.length !== 4) {
        console.log(`Invalid row: ${line}`);
        return;
      }
      const torrent = {
        uploadDate: moment(row[0], 'YYYY-MMM-DD HH:mm:ss').toDate(),
        infoHash: Buffer.from(row[1], 'base64').toString('hex'),
        title: escapeHTML(row[2])
            .replace(/^"|"$/g, '')
            .replace(/&#?\w{2,6};/g, ' ')
            .replace(/\s+/g, ' ')
            .trim(),
        size: parseInt(row[3], 10)
      };
      // backpressure: pause the reader while the limiter queue is saturated
      if (!limiter.empty()) {
        lr.pause()
      }
      limiter.schedule(() => processTorrentRecord(torrent)
          .catch((error) => console.log(`failed ${torrent.title} due: ${error}`)))
          .then(() => limiter.empty())
          .then((empty) => empty && lr.resume())
          .then(() => entriesProcessed++);
    });
    lr.on('error', (err) => {
      console.log(err);
    });
    lr.on('end', () => {
      console.log(`finished to scrape tpb dump: ${JSON.stringify(lastDump)}!`);
    });
  }
}
// Video subcategories worth storing; everything else is skipped.
const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_DVDR,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
// Subset of the above that maps to series content.
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
/**
 * Resolves a dump row against the live site (or bing as fallback) and stores
 * it; rows that cannot be found or fall outside the allowed video categories
 * are recorded as skipped so they are not retried.
 */
async function processTorrentRecord(record) {
  if (await getStoredTorrentEntry(record)) {
    return;
  }
  const torrentFound = await findTorrent(record);
  if (!torrentFound || !allowedCategories.includes(torrentFound.subcategory)) {
    return createSkipTorrentEntry(record);
  }
  const torrent = {
    infoHash: torrentFound.infoHash,
    provider: NAME,
    torrentId: torrentFound.torrentId,
    title: torrentFound.name,
    size: torrentFound.size,
    type: seriesCategories.includes(torrentFound.subcategory) ? Type.SERIES : Type.MOVIE,
    // imdb ids from the dump are only trusted for series entries
    imdbId: seriesCategories.includes(torrentFound.subcategory) && torrentFound.imdbId || undefined,
    uploadDate: torrentFound.uploadDate || record.uploadDate,
    seeders: torrentFound.seeders,
  };
  return createTorrentEntry(torrent);
}
/** Resolves a dump record on tpb directly, falling back to a bing lookup. */
async function findTorrent(record) {
  return findTorrentInSource(record).catch(() => findTorrentViaBing(record));
}
/**
 * Searches tpb by title (up to 5 result pages) for an entry whose magnet
 * link contains the record's info hash, then loads its full details
 * (retried once). Setting page to 1000 is the loop-exit sentinel for an
 * empty result page.
 */
async function findTorrentInSource(record) {
  let page = 0;
  let torrentFound;
  while (!torrentFound && page < 5) {
    const torrents = await thepiratebay.search(record.title.replace(/[\W\s]+/, ' '), { page: page });
    torrentFound = torrents.filter(torrent => torrent.magnetLink.toLowerCase().includes(record.infoHash))[0];
    page = torrents.length === 0 ? 1000 : page + 1;
  }
  if (!torrentFound) {
    return Promise.reject(new Error(`Failed to find torrent ${record.title}`));
  }
  return Promise.resolve(torrentFound)
      .then((torrent) => thepiratebay.torrent(torrent.torrentId)
          .catch(() => thepiratebay.torrent(torrent.torrentId)));
}
/**
 * Fallback lookup: searches bing for the info hash, picks a result that
 * looks like a tpb mirror page, extracts the torrent id from its url and
 * loads the details from tpb. Resolves undefined when nothing is found.
 */
async function findTorrentViaBing(record) {
  return bing.web(`${record.infoHash}`)
      .then((results) => results
          .find(result => result.description.includes('Direct download via magnet link') ||
              result.description.includes('Get this torrent')))
      .then((result) => {
        if (!result) {
          console.warn(`Failed to find torrent ${record.title}`);
          return Promise.resolve(undefined);
        }
        return result.link.match(/torrent\/(\w+)\//)[1];
      })
      .then((torrentId) => torrentId && thepiratebay.torrent(torrentId))
}
/**
 * Downloads, unzips and writes the csv dump to CSV_FILE_PATH, skipping the
 * download when the file already exists.
 * NOTE(review): returns undefined (not a promise) on the skip path - the
 * caller awaits it, which tolerates that; the '/tmp/tpb_dump.gz' literal is
 * independent of CSV_FILE_PATH.
 */
function downloadDump(dump) {
  try {
    if (fs.existsSync(CSV_FILE_PATH)) {
      console.log('dump file already exist...');
      return;
    }
  } catch (err) {
    console.error(err)
  }
  console.log('downloading dump file...');
  return needle('get', dump.url, { open_timeout: 2000, output: '/tmp/tpb_dump.gz' })
      .then((response) => response.body)
      .then((body) => {
        console.log('unzipping dump file...');
        return ungzip(body);
      })
      .then((unzipped) => {
        console.log('writing dump file...');
        return fs.promises.writeFile(CSV_FILE_PATH, unzipped);
      })
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,43 @@
const moment = require('moment');
const { Sequelize } = require('sequelize');
const Bottleneck = require('bottleneck');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
// Provider identifier used to select this provider's entries.
const NAME = 'ThePirateBay';
// tpb reports this all-zero info hash for fake/removed uploads.
const EMPTY_HASH = '0000000000000000000000000000000000000000';
const Op = Sequelize.Op;
// Parallelism for per-torrent verification requests.
const limiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Fake-entry cleanup: loads this provider's movie torrents created in the
 * last 14 days and deletes those that tpb now reports with the empty hash.
 */
async function scrape() {
  console.log(`Starting ${NAME} fake removal...`);
  const startCreatedAt = moment().subtract(14, 'day');
  const endCreatedAt = moment();
  const whereQuery = {
    provider: NAME,
    type: Type.MOVIE,
    createdAt: { [Op.between]: [startCreatedAt, endCreatedAt] }
  };
  return repository.getTorrentsBasedOnQuery(whereQuery)
      .then(torrents => {
        console.log(`Checking for ${NAME} fake entries in ${torrents.length} torrents`);
        return Promise.all(torrents.map(torrent => limiter.schedule(() => removeIfFake(torrent))))
      })
      .then(results => {
        // non-null results are the deletions that actually happened
        const removed = results.filter(result => result);
        console.log(`Finished ${NAME} fake removal with ${removed.length} removals in ${results.length} torrents`);
      });
}
/**
 * Deletes a torrent when tpb reports the all-zero info hash for it (the
 * marker of a fake/removed upload). Resolves the deletion result, or null
 * when nothing was removed or lookups/deletion failed.
 */
async function removeIfFake(torrent) {
  const info = await thepiratebay.torrent(torrent.torrentId).catch(() => null);
  if (!info || info.infoHash !== EMPTY_HASH) {
    return null;
  }
  console.log(`Removing ${NAME} fake torrent [${torrent.torrentId}][${torrent.infoHash}] ${torrent.title}`);
  return repository.deleteTorrent(torrent).catch(() => null);
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,87 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'ThePirateBay';
// How many listing pages to walk per category.
const UNTIL_PAGE = 5;
// Parallelism for per-row processing.
const limiter = new Bottleneck({ maxConcurrent: 10 });
// Video subcategories worth storing; note MOVIES_DVDR is not scraped here.
const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
// Subset of the above that maps to series content.
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
async function scrape() {
const scrapeStart = moment();
const lastScrape = await repository.getProvider({ name: NAME });
console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
return scrapeLatestTorrents()
.then(() => {
lastScrape.lastScraped = scrapeStart;
return lastScrape.save();
})
.then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
/** Seeder refresh is disabled for tpb; always resolves to an empty list. */
async function updateSeeders(torrent) {
  return [];
}
/** Scrapes each allowed category sequentially and flattens the results. */
async function scrapeLatestTorrents() {
  const tasks = allowedCategories.map(category => () => scrapeLatestTorrentsForCategory(category));
  return Promises.sequence(tasks)
      .then(entries => entries.reduce((a, b) => a.concat(b), []));
}
/**
 * Browses one tpb category page, processes every row, and recurses into the
 * next page while results keep coming and UNTIL_PAGE is not reached. Browse
 * failures are logged and end the category.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return thepiratebay.browse({ category, page })
      .catch(error => {
        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
        return Promise.resolve([]);
      })
      .then(torrents => Promise.all(torrents.map(torrent => limiter.schedule(() => processTorrentRecord(torrent)))))
      .then(resolved => resolved.length > 0 && page < UNTIL_PAGE
          ? scrapeLatestTorrentsForCategory(category, page + 1)
          : Promise.resolve());
}
/**
 * Stores a browsed tpb record unless an entry for it already exists (in
 * which case checkAndUpdateTorrent refreshes it) or its subcategory is not
 * in the allowed video categories.
 */
async function processTorrentRecord(record) {
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  if (!record || !allowedCategories.includes(record.subcategory)) {
    return Promise.resolve('Invalid torrent record');
  }
  const torrent = {
    infoHash: record.infoHash,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.name.replace(/\t|\s+/g, ' '),
    type: seriesCategories.includes(record.subcategory) ? Type.SERIES : Type.MOVIE,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    // imdb ids are only trusted for series entries
    imdbId: seriesCategories.includes(record.subcategory) && record.imdbId || undefined,
    languages: record.languages && record.languages.trim() || undefined
  };
  return createTorrentEntry(torrent);
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,112 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const LineByLineReader = require('line-by-line');
const decode = require('magnet-uri');
const thepiratebay = require('./thepiratebay_api.js');
const { Type } = require('../../lib/types');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
// Provider identifier stored with each torrent entry.
const NAME = 'ThePirateBay';
// Location of the pre-downloaded csv dump processed by scrape().
const CSV_FILE_PATH = '/tmp/tpb.csv';
// Parallelism for per-row processing while streaming the dump.
const limiter = new Bottleneck({ maxConcurrent: 40 });
/**
 * Streams a local tpb csv dump line by line and processes each row into a
 * torrent entry, pausing the reader while the limiter queue is saturated.
 * NOTE(review): checkPoint is a hardcoded resume offset for an interrupted
 * run - rows up to that index are skipped.
 */
async function scrape() {
  // await processTorrentRecord({ torrentId: 26877339, category: 'Video' });
  console.log(`starting to scrape tpb dump...`);
  //const checkPoint = moment('2013-06-16 00:00:00', 'YYYY-MMM-DD HH:mm:ss').toDate();
  const checkPoint = 4115000;
  let entriesProcessed = 0;
  const lr = new LineByLineReader(CSV_FILE_PATH);
  lr.on('line', (line) => {
    if (entriesProcessed % 1000 === 0) {
      console.log(`Processed ${entriesProcessed} entries`);
    }
    if (entriesProcessed <= checkPoint) {
      entriesProcessed++;
      return;
    }
    // 10 comma-separated columns, with the title and magnet possibly quoted
    const row = line.match(/(?<=^|,)(".*"|[^,]*)(?=,|$)/g);
    if (row.length !== 10) {
      console.log(`Invalid row: ${line}`);
      return;
    }
    const torrent = {
      torrentId: row[0],
      title: row[1]
          .replace(/^"|"$/g, '')
          .replace(/&amp;/g, '&')
          .replace(/&\w{2,6};/g, ' ')
          .replace(/\s+/g, ' ')
          .trim(),
      size: parseInt(row[2], 10),
      category: row[4],
      subcategory: row[5],
      // fall back to the magnet link when the hash column is empty
      infoHash: row[7].toLowerCase() || decode(row[9]).infoHash,
      magnetLink: row[9],
      uploadDate: moment(row[8]).toDate(),
    };
    // backpressure: pause the reader while the limiter queue is saturated
    if (!limiter.empty()) {
      lr.pause()
    }
    limiter.schedule(() => processTorrentRecord(torrent)
        .catch((error) => console.log(`failed ${torrent.title} due: ${error}`)))
        .then(() => limiter.empty())
        .then((empty) => empty && lr.resume())
        .then(() => entriesProcessed++);
  });
  lr.on('error', (err) => {
    console.log(err);
  });
  lr.on('end', () => {
    console.log(`finished to scrape tpb dump!`);
  });
}
// Live-site subcategories worth importing; records outside these become skip entries.
const allowedCategories = [
  thepiratebay.Categories.VIDEO.MOVIES,
  thepiratebay.Categories.VIDEO.MOVIES_HD,
  thepiratebay.Categories.VIDEO.MOVIES_DVDR,
  thepiratebay.Categories.VIDEO.MOVIES_3D,
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
// Subset of allowedCategories mapped to Type.SERIES; the rest map to Type.MOVIE.
const seriesCategories = [
  thepiratebay.Categories.VIDEO.TV_SHOWS,
  thepiratebay.Categories.VIDEO.TV_SHOWS_HD
];
/**
 * Decides whether a dump record should be imported and creates its entry.
 * Non-video rows become skip entries; already-stored torrents are left alone;
 * the live site is consulted for the authoritative subcategory and metadata.
 */
async function processTorrentRecord(record) {
  if (record.category !== 'Video') {
    return createSkipTorrentEntry(record);
  }
  const alreadyStored = await getStoredTorrentEntry(record);
  if (alreadyStored) {
    return;
  }
  const details = await thepiratebay.torrent(record.torrentId);
  const isAllowed = details && allowedCategories.includes(details.subcategory);
  if (!isAllowed) {
    return createSkipTorrentEntry(record);
  }
  const isSeries = seriesCategories.includes(details.subcategory);
  return createTorrentEntry({
    infoHash: details.infoHash,
    provider: NAME,
    torrentId: details.torrentId,
    title: details.name,
    size: details.size,
    type: isSeries ? Type.SERIES : Type.MOVIE,
    imdbId: details.imdbId,
    uploadDate: details.uploadDate,
    seeders: details.seeders,
  });
}
module.exports = { scrape, NAME };

View File

@@ -0,0 +1,185 @@
const cheerio = require('cheerio');
const needle = require('needle');
const moment = require('moment');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { getRandomUserAgent } = require('../../lib/requestHelper');
// Mirror domains tried in parallel; the first successful response wins.
const defaultProxies = [
  // 'https://torrentgalaxy.to',
  // 'https://torrentgalaxy.mx',
  'https://torrentgalaxy.su'
];
// Connection timeout for a single proxy request, in milliseconds.
const defaultTimeout = 10000;
// TorrentGalaxy numeric category ids, as used in the site's `cat` query parameter.
const Categories = {
  ANIME: '28',
  MOVIE_4K: '3',
  MOVIE_PACKS: '4',
  MOVIE_SD: '1',
  MOVIE_HD: '42',
  MOVIE_CAM: '45',
  MOVIE_BOLLYWOOD: '46',
  TV_SD: '5',
  TV_HD: '41',
  TV_PACKS: '6',
  TV_SPORT: '7',
  DOCUMENTARIES: '9'
};
/**
 * Fetches and parses a single TorrentGalaxy torrent page by id.
 * Races all proxies and retries the whole round on failure, rejecting
 * once `retries` attempts are exhausted.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  const proxies = config.proxyList || defaultProxies;
  const requests = proxies.map((proxyUrl) => singleRequest(`${proxyUrl}/torrent/${torrentId}`));
  return Promises.first(requests)
      .then((body) => parseTorrentPage(body))
      .then((parsed) => ({ torrentId, ...parsed }))
      .catch(() => torrent(torrentId, config, retries - 1));
}
/**
 * Searches TorrentGalaxy for a keyword within an optional category/page.
 * Races all proxies and retries on failure until `retries` is exhausted.
 *
 * Fix: the keyword is now URI-encoded before interpolation, so queries
 * containing spaces, '&', '#' or '+' no longer produce a malformed URL or
 * silently truncate the query string.
 */
function search(keyword, config = {}, retries = 2) {
  if (!keyword || retries === 0) {
    return Promise.reject(new Error(`Failed ${keyword} search`));
  }
  const proxyList = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  // Encode the user-supplied keyword; the site expects 0-based page indexes.
  const query = encodeURIComponent(keyword);
  return Promises.first(proxyList
      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}&search=${query}`)))
      .then((body) => parseTableBody(body))
      .catch(() => search(keyword, config, retries - 1));
}
/**
 * Lists a TorrentGalaxy category page (1-based in config, 0-based on site).
 * Retries the whole round on failure, finally rejecting with the last error.
 */
function browse(config = {}, retries = 2, error = null) {
  if (retries === 0) {
    return Promise.reject(error || new Error(`Failed browse request`));
  }
  const proxies = config.proxyList || defaultProxies;
  const page = config.page || 1;
  const category = config.category;
  const requests = proxies
      .map((proxyUrl) => singleRequest(`${proxyUrl}/torrents.php?cat=${category}&page=${page - 1}`));
  return Promises.first(requests)
      .then((body) => parseTableBody(body))
      .catch((err) => browse(config, retries - 1, err));
}
/**
 * Performs one GET against a proxy and resolves with the raw HTML body.
 * Throws when the body is missing or looks like a block/error page so the
 * caller can fall through to another proxy or retry.
 */
function singleRequest(requestUrl) {
  const options = { userAgent: getRandomUserAgent(), open_timeout: defaultTimeout, follow: 2 };
  return needle('get', requestUrl, options)
      .then((response) => {
        const body = response.body;
        if (!body) {
          throw new Error(`No body: ${requestUrl} with status ${response.statusCode}`);
        }
        if (body.includes('Access Denied')) {
          console.log(`Access Denied: ${requestUrl}`);
          throw new Error(`Access Denied: ${requestUrl}`);
        }
        const errorPageMarkers = ['502: Bad gateway', '403 Forbidden', 'Origin DNS error'];
        if (errorPageMarkers.some((marker) => body.includes(marker))) {
          throw new Error(`Invalid body contents: ${requestUrl}`);
        }
        return body;
      });
}
/**
 * Parses a TorrentGalaxy listing page into an array of torrent summaries.
 * Rows that fail to parse (layout drift, missing attributes) are logged and
 * skipped rather than failing the whole page.
 */
function parseTableBody(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    // NOTE(review): reject() here is not followed by a return, so execution
    // would continue; cheerio.load is not expected to return a falsy value,
    // but consider returning after reject.
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const torrents = [];
    $('.tgxtable > div').each((i, element) => {
      // The first row of the table is the header — skip it.
      if (i === 0) return;
      const row = $(element);
      const magnetLink = row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(2)').attr('href');
      // IMDb id is embedded in a search link within the row's HTML, if present.
      const imdbIdMatch = row.html().match(/search=(tt\d+)/i);
      try {
        torrents.push({
          name: row.find('.tgxtablecell div a[title]').first().text(),
          infoHash: decode(magnetLink).infoHash,
          magnetLink: magnetLink,
          torrentLink: row.find('div:nth-of-type(n+2) .collapsehide > a:nth-of-type(1)').first().attr('href'),
          torrentId: row.find('.tgxtablecell div a[title]').first().attr('href').match(/torrent\/(\d+)/)[1],
          // Green check icon marks a verified upload.
          verified: !!row.find('i.fa-check').length,
          category: row.find('div:nth-of-type(n+2) .shrink a').first().attr('href').match(/cat=(\d+)$/)[1],
          seeders: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'green\'] b').first().text()),
          leechers: parseInt(row.find('div:nth-of-type(n+2) .collapsehide [color=\'#ff0000\'] b').first().text()),
          languages: row.find('.tgxtablecell img[title]').first().attr('title'),
          size: parseSize(row.find('.collapsehide span.badge-secondary').first().text()),
          uploadDate: parseDate(row.find('div.collapsehide:nth-of-type(12)').first().text()),
          imdbId: imdbIdMatch && imdbIdMatch[1],
        });
      } catch (e) {
        // Best-effort: a single malformed row must not abort the whole listing.
        console.error('Failed parsing TorrentGalaxy row: ', e);
      }
    });
    resolve(torrents);
  });
}
/**
 * Parses a single TorrentGalaxy torrent detail page into one torrent object.
 * Selector positions (nth-of-type) mirror the site's detail-table layout.
 */
function parseTorrentPage(body) {
  return new Promise((resolve, reject) => {
    const $ = cheerio.load(body);
    // NOTE(review): reject() is not followed by a return — execution would
    // continue if cheerio.load ever returned a falsy value; consider returning.
    if (!$) {
      reject(new Error('Failed loading body'));
    }
    const content = $('div[class="torrentpagetable limitwidth"]').first();
    const magnetLink = $('a[class="btn btn-danger"]').attr('href');
    const imdbIdContent = $('a[title="IMDB link"]').attr('href');
    const imdbIdMatch = imdbIdContent && imdbIdContent.match(/imdb\.com\/title\/(tt\d+)/i);
    const torrent = {
      name: content.find('.linebreakup a').first().text(),
      infoHash: decode(magnetLink).infoHash,
      magnetLink: magnetLink,
      // A warning-triangle icon marks an unverified upload.
      verified: !content.find('i.fa-exclamation-triangle').length,
      torrentLink: $('a[class="btn btn-success"]').attr('href'),
      seeders: parseInt(content.find('font[color=\'green\']').first().text(), 10),
      category: content.find('div:nth-of-type(4) a:nth-of-type(2)').first().attr('href').match(/cat=(\d+)$/)[1],
      languages: content.find('div:nth-of-type(5) div:nth-of-type(2)').first().text().trim(),
      size: parseSize(content.find('div:nth-of-type(6) div:nth-of-type(2)').first().text()),
      uploadDate: parseDate(content.find('div:nth-of-type(9) div:nth-of-type(2)').first().text()),
      imdbId: imdbIdMatch && imdbIdMatch[1],
    };
    resolve(torrent);
  });
}
/**
 * Converts a human-readable size label (e.g. "1.4 GB", "700 MB") to bytes.
 * Returns undefined for empty/missing input; unrecognised units are treated
 * as plain bytes. Thousands separators (commas/apostrophes) are tolerated.
 */
function parseSize(sizeText) {
  if (!sizeText) {
    return undefined;
  }
  const numericPart = parseFloat(sizeText.replace(/[',]/g, ''));
  let multiplier;
  if (sizeText.includes('GB')) {
    multiplier = 1024 ** 3;
  } else if (sizeText.includes('MB')) {
    multiplier = 1024 ** 2;
  } else if (sizeText.includes('KB') || sizeText.includes('kB')) {
    multiplier = 1024;
  } else {
    multiplier = 1;
  }
  return Math.floor(numericPart * multiplier);
}
/**
 * Parses TorrentGalaxy date strings into Date objects.
 * Relative values ("NN Mins ago" / "NN Hours ago") are subtracted from now;
 * absolute values are expected as DD-MM-YY[YY] HH:mm, with slashes tolerated
 * and two-digit years expanded to 20YY.
 */
function parseDate(dateString) {
  if (dateString.includes('ago')) {
    const quantity = parseInt(dateString, 10);
    const unit = dateString.includes('Min') ? 'minutes' : 'hours';
    return moment().subtract(quantity, unit).toDate();
  }
  const normalized = dateString.replace(/\//g, '-').replace(/-(\d{2})\s/, '-20$1 ')
  return moment(normalized, 'DD-MM-YYYY HH:mm').toDate();
}
module.exports = { torrent, search, browse, Categories };

View File

@@ -0,0 +1,132 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const torrentGalaxy = require('./torrentgalaxy_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const Promises = require('../../lib/promises');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider name recorded alongside every torrent entry created by this scraper.
const NAME = 'TorrentGalaxy';
// Category id -> internal Type, built once at module load (typeMapping is hoisted).
const TYPE_MAPPING = typeMapping();
// Serializes site requests with a 5s gap between them to avoid hammering proxies.
const api_limiter = new Bottleneck({ maxConcurrent: 1, minTime: 5000 });
// Caps how many torrent records are processed concurrently.
const limiter = new Bottleneck({ maxConcurrent: 10 });
// Categories scraped on each run (TV_SPORT is mapped but not scraped here).
const allowedCategories = [
  torrentGalaxy.Categories.ANIME,
  torrentGalaxy.Categories.MOVIE_4K,
  torrentGalaxy.Categories.MOVIE_PACKS,
  torrentGalaxy.Categories.MOVIE_SD,
  torrentGalaxy.Categories.MOVIE_HD,
  torrentGalaxy.Categories.MOVIE_CAM,
  torrentGalaxy.Categories.MOVIE_BOLLYWOOD,
  torrentGalaxy.Categories.TV_SD,
  torrentGalaxy.Categories.TV_HD,
  torrentGalaxy.Categories.TV_PACKS,
  torrentGalaxy.Categories.DOCUMENTARIES,
];
// Categories whose torrents bundle multiple movies/episodes in one upload.
const packCategories = [
  torrentGalaxy.Categories.MOVIE_PACKS,
  torrentGalaxy.Categories.TV_PACKS
];
/**
 * Runs a full TorrentGalaxy scrape pass over all allowed categories, then
 * persists the pass's start timestamp on the provider record so the next run
 * knows when this one began.
 *
 * Cleanup: removed commented-out single-id debug code that had been left in.
 */
async function scrape() {
  const scrapeStart = moment();
  const lastScrape = await repository.getProvider({ name: NAME });
  console.log(`[${scrapeStart}] starting ${NAME} scrape...`);
  return scrapeLatestTorrents()
      .then(() => {
        lastScrape.lastScraped = scrapeStart;
        return lastScrape.save();
      })
      .then(() => console.log(`[${moment()}] finished ${NAME} scrape`));
}
/** Re-fetches the torrent's detail page through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchDetails = () => torrentGalaxy.torrent(torrent.torrentId);
  return limiter.schedule(fetchDetails);
}
/** Scrapes every allowed category sequentially and flattens the per-category results. */
async function scrapeLatestTorrents() {
  const categoryScrapers = allowedCategories
      .map((category) => () => scrapeLatestTorrentsForCategory(category));
  return Promises.sequence(categoryScrapers)
      .then((entries) => entries.flat());
}
/**
 * Scrapes one category page, processes its torrents, and recurses to the next
 * page while results keep coming and the per-category page cap is not reached.
 * Browse failures are logged and treated as an empty page.
 */
async function scrapeLatestTorrentsForCategory(category, page = 1) {
  console.log(`Scrapping ${NAME} ${category} category page ${page}`);
  return api_limiter.schedule(() => torrentGalaxy.browse({ category, page }))
      .catch((error) => {
        console.warn(`Failed ${NAME} scrapping for [${page}] ${category} due: `, error);
        return [];
      })
      .then((torrents) => Promise.all(
          torrents.map((record) => limiter.schedule(() => processTorrentRecord(record)))))
      .then((processed) => {
        const shouldContinue = processed.length > 0 && page < getMaxPage(category);
        return shouldContinue ? scrapeLatestTorrentsForCategory(category, page + 1) : undefined;
      });
}
/**
 * Converts one browse-row record into a torrent entry.
 * Unmapped-category or unverified records are ignored; records already known
 * are only refreshed; old, unseeded, non-pack torrents are skipped.
 */
async function processTorrentRecord(record) {
  const isUsable = record && TYPE_MAPPING[record.category] && record.verified;
  if (!isUsable) {
    return 'Invalid torrent record';
  }
  // "Other" language labels carry no useful information — drop them.
  const languages = (record.languages || '').includes('Other') ? undefined : record.languages;
  const torrent = {
    provider: NAME,
    infoHash: record.infoHash,
    torrentId: record.torrentId,
    torrentLink: record.torrentLink,
    title: record.name.replace(/\t|\s+/g, ' '),
    type: TYPE_MAPPING[record.category],
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId,
    pack: packCategories.includes(record.category),
    languages
  };
  if (await checkAndUpdateTorrent(torrent)) {
    return torrent;
  }
  const staleCutoff = moment().subtract(18, 'month');
  const isOld = moment(torrent.uploadDate).isBefore(staleCutoff);
  if (torrent.seeders === 0 && isOld && !torrent.pack) {
    console.log(`Skipping old unseeded torrent [${torrent.infoHash}] ${torrent.title}`)
    return torrent;
  }
  return createTorrentEntry(torrent).then(() => torrent);
}
/** Builds the category-id -> internal Type lookup used by processTorrentRecord. */
function typeMapping() {
  const { Categories } = torrentGalaxy;
  return {
    [Categories.MOVIE_SD]: Type.MOVIE,
    [Categories.MOVIE_HD]: Type.MOVIE,
    [Categories.MOVIE_4K]: Type.MOVIE,
    [Categories.MOVIE_CAM]: Type.MOVIE,
    [Categories.MOVIE_PACKS]: Type.MOVIE,
    [Categories.MOVIE_BOLLYWOOD]: Type.MOVIE,
    [Categories.DOCUMENTARIES]: Type.MOVIE,
    [Categories.TV_SD]: Type.SERIES,
    [Categories.TV_HD]: Type.SERIES,
    [Categories.TV_PACKS]: Type.SERIES,
    [Categories.TV_SPORT]: Type.SERIES,
    [Categories.ANIME]: Type.ANIME
  };
}
/** High-traffic categories are scraped 5 pages deep; all others only 1 page. */
function getMaxPage(category) {
  const deepCategories = [
    torrentGalaxy.Categories.TV_SD,
    torrentGalaxy.Categories.TV_HD,
    torrentGalaxy.Categories.MOVIE_SD,
    torrentGalaxy.Categories.MOVIE_HD
  ];
  return deepCategories.includes(category) ? 5 : 1;
}
module.exports = { scrape, updateSeeders, NAME };

View File

@@ -0,0 +1,92 @@
const needle = require('needle');
const Promises = require('../../lib/promises');
const { getRandomUserAgent } = require('./../../lib/requestHelper');
// YTS API hosts to try; the first successful response wins.
const defaultProxies = [
  'https://yts.mx'
];
// Connection timeout for a single request, in milliseconds.
const defaultTimeout = 30000;
// Page size requested from the list_movies endpoint.
const limit = 50;
/**
 * Fetches one movie's torrents from the YTS movie_details endpoint.
 * Retries the whole round on failure until `retries` is exhausted.
 */
function torrent(torrentId, config = {}, retries = 2) {
  if (!torrentId || retries === 0) {
    return Promise.reject(new Error(`Failed ${torrentId} search`));
  }
  const requests = defaultProxies
      .map((proxyUrl) => singleRequest(`${proxyUrl}/api/v2/movie_details.json?movie_id=${torrentId}`, config));
  return Promises.first(requests)
      .then((body) => parseResults(body))
      .catch(() => torrent(torrentId, config, retries - 1));
}
/**
 * Searches YTS for a query term via the list_movies endpoint.
 * Retries the whole round on failure until `retries` is exhausted.
 *
 * Fix: the query is now URI-encoded before interpolation, so terms containing
 * spaces, '&' or '#' no longer produce a malformed request URL.
 */
function search(query, config = {}, retries = 2) {
  if (!query || retries === 0) {
    return Promise.reject(new Error(`Failed ${query} search`));
  }
  const encodedQuery = encodeURIComponent(query);
  return Promises.first(defaultProxies
      .map(proxyUrl => singleRequest(`${proxyUrl}/api/v2/list_movies.json?limit=${limit}&query_term=${encodedQuery}`, config)))
      .then(results => parseResults(results))
      .catch(error => search(query, config, retries - 1));
}
/**
 * Lists one page of YTS movies via the list_movies endpoint (1-based page).
 * Retries the whole round on failure until `retries` is exhausted.
 */
function browse(config = {}, retries = 2) {
  if (retries === 0) {
    return Promise.reject(new Error(`Failed browse request`));
  }
  const page = config.page || 1;
  const requests = defaultProxies
      .map((proxyUrl) => singleRequest(`${proxyUrl}/api/v2/list_movies.json?limit=${limit}&page=${page}`, config));
  return Promises.first(requests)
      .then((results) => parseResults(results))
      .catch(() => browse(config, retries - 1));
}
/**
 * Performs one GET against the YTS API and resolves with the parsed body.
 *
 * Fix: rejects with a proper Error instance instead of a bare string, so
 * callers get a stack trace and `instanceof Error` checks hold.
 */
function singleRequest(requestUrl, config = {}) {
  const timeout = config.timeout || defaultTimeout;
  const options = { userAgent: getRandomUserAgent(), open_timeout: timeout, follow: 2 };
  return needle('get', requestUrl, options)
      .then(response => {
        if (!response.body) {
          return Promise.reject(new Error(`No body: ${requestUrl}`));
        }
        return response.body;
      });
}
/**
 * Normalizes a YTS API payload (movie_details or list_movies shape) into a
 * flat array of torrent summaries, one per torrent file of each movie.
 *
 * Fixes: throws a proper Error on malformed payloads (previously returned a
 * promise rejected with a bare string), and also rejects payloads where
 * `data.movies` is present but not an array (previously that crashed later
 * with an opaque TypeError in .filter).
 */
function parseResults(results) {
  const movies = results && results.data
      && (results.data.movies || (results.data.movie && [results.data.movie]));
  if (!Array.isArray(movies)) {
    console.log('Incorrect results: ', results);
    throw new Error('Incorrect results');
  }
  return movies
      .filter(movie => Array.isArray(movie.torrents))
      .map(movie => parseMovie(movie))
      .reduce((a, b) => a.concat(b), []);
}

/** Expands one YTS movie record into one entry per torrent file. */
function parseMovie(movie) {
  return movie.torrents.map(torrent => ({
    name: `${movie.title} ${movie.year} ${torrent.quality} ${formatType(torrent.type)} `,
    torrentId: `${movie.id}-${torrent.hash.trim().toLowerCase()}`,
    infoHash: torrent.hash.trim().toLowerCase(),
    torrentLink: torrent.url,
    seeders: torrent.seeds,
    size: torrent.size_bytes,
    uploadDate: new Date(torrent.date_uploaded_unix * 1000),
    imdbId: movie.imdb_code
  }));
}

/** Maps YTS source types to conventional release-tag spellings. */
function formatType(type) {
  if (type === 'web') {
    return 'WEBRip';
  }
  if (type === 'bluray') {
    return 'BluRay';
  }
  return type.toUpperCase();
}
module.exports = { torrent, search, browse };

View File

@@ -0,0 +1,71 @@
const moment = require('moment');
const Bottleneck = require('bottleneck');
const yts = require('./yts_api');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const { createTorrentEntry, checkAndUpdateTorrent } = require('../../lib/torrentEntries');
// Provider name recorded alongside every torrent entry created by this scraper.
const NAME = 'YTS';
// Highest listing page fetched per scrape run (inclusive).
const UNTIL_PAGE = 2;
// Caps how many torrent records are processed concurrently.
const limiter = new Bottleneck({ maxConcurrent: 10 });
/**
 * Runs a full YTS scrape pass, then persists the pass's start timestamp on
 * the provider record so the next run knows when this one began.
 */
async function scrape() {
  const startedAt = moment();
  const providerRecord = await repository.getProvider({ name: NAME });
  console.log(`[${startedAt}] starting ${NAME} scrape...`);
  await scrapeLatestTorrents();
  providerRecord.lastScraped = startedAt;
  await providerRecord.save();
  console.log(`[${moment()}] finished ${NAME} scrape`);
}
/** Re-fetches the movie's details through the rate limiter to refresh seeder counts. */
async function updateSeeders(torrent) {
  const fetchDetails = () => yts.torrent(torrent.torrentId);
  return limiter.schedule(fetchDetails);
}
/** Entry point for latest-torrent scraping; YTS exposes a single combined listing. */
async function scrapeLatestTorrents() {
  return scrapeLatestTorrentsForCategory(1);
}
/**
 * Scrapes one listing page, processes its torrents, and recurses to the next
 * page while results keep coming and UNTIL_PAGE is not reached.
 * Browse failures are logged and treated as an empty page.
 */
async function scrapeLatestTorrentsForCategory(page = 1) {
  console.log(`Scrapping ${NAME} page ${page}`);
  const torrents = await yts.browse({ page })
      .catch((error) => {
        console.warn(`Failed ${NAME} scrapping for [${page}] due: `, error);
        return [];
      });
  const processed = await Promise.all(
      torrents.map((record) => limiter.schedule(() => processTorrentRecord(record))));
  if (processed.length > 0 && page < UNTIL_PAGE) {
    return scrapeLatestTorrentsForCategory(page + 1);
  }
}
/**
 * Converts one YTS browse record into a torrent entry.
 * Known torrents are only refreshed via checkAndUpdateTorrent; new ones are
 * created with normalized title and movie type.
 *
 * Fix: the record is validated BEFORE checkAndUpdateTorrent is called —
 * previously a null or size-less record was handed to the repository check
 * first (and could make it crash), with validation running only afterwards.
 * This also matches the validate-first order used by the other scrapers.
 */
async function processTorrentRecord(record) {
  if (!record || !record.size) {
    return 'Invalid torrent record';
  }
  if (await checkAndUpdateTorrent(record)) {
    return record;
  }
  const torrent = {
    infoHash: record.infoHash,
    provider: NAME,
    torrentId: record.torrentId,
    title: record.name.replace(/\t|\s+/g, ' ').trim(),
    type: Type.MOVIE,
    size: record.size,
    seeders: record.seeders,
    uploadDate: record.uploadDate,
    imdbId: record.imdbId,
  };
  return createTorrentEntry(torrent).then(() => torrent);
}
module.exports = { scrape, updateSeeders, NAME };