Added back original scrapers, integrated with PGSQL

This commit is contained in:
Gabisonfire
2024-01-17 16:20:00 -05:00
parent 364d1545b1
commit cab7f38c66
79 changed files with 65832 additions and 1702 deletions

View File

@@ -1,4 +0,0 @@
*.lock
*/scrape-cache
Dockerfile
docker-compose.yml

View File

@@ -1,12 +0,0 @@
FROM python:3.10.13
COPY . .
RUN pip --no-cache-dir install pipenv && \
apt-get update && apt-get install -y sqlite3 && \
apt-get clean autoclean && \
apt-get autoremove --yes && \
mkdir /sqlite && \
sqlite3 /sqlite/torrentio.sqlite "VACUUM;" && \
cat torrentio.sql | sqlite3 /sqlite/torrentio.sqlite && \
ls -l /sqlite && \
pipenv install && touch .init
CMD [ "pipenv", "run", "python", "-u", "scraper.py" ]

View File

@@ -1,16 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
requests = "*"
beautifulsoup4 = "*"
imdbpy = "*"
apscheduler = "*"
[dev-packages]
[requires]
python_version = "3.10"
python_full_version = "3.10.13"

View File

@@ -1,15 +0,0 @@
{
"1337x": {
"sleep": 10,
"urls_to_scrape": [
"https://1337x.to/trending/w/movies/",
"https://1337x.to/trending/d/movies/"
],
"scrape_interval": {
"days": 0,
"hours" : 1,
"minutes": 0,
"seconds": 0
}
}
}

View File

@@ -2,30 +2,43 @@ version: '2'
services:
mongodb:
restart: always
image: docker.io/bitnami/mongodb:7.0
ports:
- "27017:27017"
volumes:
- 'mongodb_data:/bitnami/mongodb'
scraper:
build: ./
build: ../../scraper/
restart: always
volumes:
- 'sqlite_data:/sqlite'
ports:
- "7001:7001"
environment:
- PORT=7001
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
torrentio:
build: ../
volumes:
- 'sqlite_data:/sqlite'
restart: always
ports:
- "7000:7000"
links:
- "mongodb:mongodb"
environment:
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=sqlite:/sqlite/torrentio.sqlite
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
postgres:
image: postgres:14-alpine
ports:
- "5432:5432"
volumes:
- pg_data:/var/lib/postgresql/data
- './init:/docker-entrypoint-initdb.d'
environment:
- POSTGRES_HOST_AUTH_METHOD=trust
- POSTGRES_USER=postgres
- POSTGRES_DB=torrentio
volumes:
mongodb_data:
driver: local
sqlite_data:
pg_data:
driver: local
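
Both the scraper and torrentio now point at the same database through DATABASE_URI, a libpq-style URI matching the postgres service defined above. A minimal sketch of how a Python service could consume it, assuming psycopg2 as the driver (the driver choice and the get_connection helper are illustrative, not part of this commit):

import os

import psycopg2  # assumed driver; nothing in this commit pins a specific client library

# Default mirrors the compose file; the real value is injected via the environment.
DATABASE_URI = os.environ.get("DATABASE_URI", "postgres://postgres@postgres:5432/torrentio")

def get_connection():
    """Open a PostgreSQL connection from the libpq-style URI (hypothetical helper)."""
    return psycopg2.connect(DATABASE_URI)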

View File

@@ -0,0 +1,53 @@
CREATE TABLE "torrents" (
"infoHash" varchar(64) PRIMARY KEY,
"provider" varchar(32) NOT NULL,
"torrentId" varchar(128),
"title" varchar(256) NOT NULL,
"size" bigint,
"type" varchar(16) NOT NULL,
"uploadDate" date NOT NULL,
"seeders" integer,
"trackers" varchar(4096),
"languages" varchar(4096),
"resolution" varchar(16),
"createdAt" date,
"updatedAt" date,
"reviewed" boolean,
"opened" boolean
);
CREATE TABLE "files" (
"id" SERIAL PRIMARY KEY,
"infoHash" varchar(64) NOT NULL,
"fileIndex" integer,
"title" varchar(256) NOT NULL,
"size" bigint,
"imdbId" varchar(32),
"imdbSeason" integer,
"imdbEpisode" integer,
"kitsuId" integer,
"kitsuEpisode" integer,
"createdAt" date,
"updatedAt" date,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
UNIQUE("infoHash","fileIndex")
);
CREATE TABLE "subtitles" (
"id" SERIAL PRIMARY KEY,
"infoHash" varchar(64) NOT NULL,
"fileIndex" integer NOT NULL,
"fileId" bigint,
"title" varchar(512) NOT NULL,
"size" bigint,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
FOREIGN KEY("fileId") REFERENCES files("id") ON DELETE SET NULL
);
CREATE TABLE "contents" (
"infoHash" varchar(64),
"fileIndex" integer,
"path" varchar(256),
"size" bigint,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE
);
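
For illustration, a sketch of how a scraper could write one result into this schema, using ON CONFLICT upserts as the PostgreSQL counterpart of the SQLite INSERT OR REPLACE in the deleted scraper code further down; psycopg2 and the upsert_torrent helper are assumptions, not code shipped by this commit.

from datetime import date

import psycopg2  # assumed driver, as in the sketch above

def upsert_torrent(conn, torrent):
    """Hypothetical helper: store one scraped torrent and its files.

    Field names mirror the dict built by the 1337x scraper below; the upload
    date is filled with today's date because the scraper does not collect it."""
    with conn.cursor() as cur:
        cur.execute(
            'INSERT INTO torrents ("infoHash", provider, title, size, type,'
            ' "uploadDate", seeders, trackers)'
            " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
            ' ON CONFLICT ("infoHash") DO UPDATE SET'
            " seeders = EXCLUDED.seeders, trackers = EXCLUDED.trackers",
            (
                torrent["infoHash"], "1337x", torrent["title"], torrent["size"],
                "movie", date.today(), int(torrent["seeders"]),
                ",".join(torrent["trackers"]),
            ),
        )
        for index, name in enumerate(torrent["files"]):
            cur.execute(
                'INSERT INTO files ("infoHash", "fileIndex", title, size, "imdbId")'
                " VALUES (%s, %s, %s, %s, %s)"
                ' ON CONFLICT ("infoHash", "fileIndex") DO NOTHING',
                (torrent["infoHash"], index, name, torrent["size"], torrent["imdbid"]),
            )
    conn.commit()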

View File

@@ -1,141 +0,0 @@
import os
import re
import math
import requests
import time
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from shared import imdb_find, build_and_write, extract_title, read_config
TORRENT_CACHES = ('http://itorrents.org', 'http://torrage.info', 'http://btcache.me')
BASE_URL = 'https://1337x.to'
MOVIE_BASE = 'https://1337x.to/movie-library'
MOVIE_LIBRARY_MAX_PAGE = 301
CACHE_DIR = "./scrape-cache"
PROVIDER = "1337x"
SLEEP_BETWEEN_REQUESTS = read_config(PROVIDER, "sleep")
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
def get_links_and_process(url):
links = []
print(f"Requesting movies from: {url}")
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
main = req.text
soup = BeautifulSoup(main, "html.parser")
for a in soup.find_all("a"):
if a.get("href").startswith("/torrent/"):
links.append((a.get("href"), extract_title(a.text)))
process_links(links)
def get_links_initial():
links = []
for i in range(1,MOVIE_LIBRARY_MAX_PAGE + 1):
try:
print(f"Sleeping {SLEEP_BETWEEN_REQUESTS}")
time.sleep(SLEEP_BETWEEN_REQUESTS)
main = ""
if os.path.isfile(f"{CACHE_DIR}/main-{i}.html"):
print(f"Reading main page({i}) from cache...")
main = open(f"{CACHE_DIR}/main-{i}.html", "r").read()
else:
print(f"Requesting main index: {MOVIE_BASE}/{i}/")
req = requests.get(f"{MOVIE_BASE}/{i}/", headers={'User-Agent': 'Mozilla/5.0'})
if req.status_code == 404:
print(f"Page does not exist: {MOVIE_BASE}/{i}/. Breaking loop.")
break
main = req.text
open(f"{CACHE_DIR}/main-{i}.html", "w+").write(main)
movies = []
soup = BeautifulSoup(main, "html.parser")
for h3 in soup.find_all("h3"):
a = h3.findChildren("a", href=True)[0]
movie_link = a.get("href")
movie_title = a.text
movies.append((movie_title, movie_link))
for movie in movies:
if os.path.isfile(f"{CACHE_DIR}{movie[1]}html.html"):
print(f"Reading movie page({movie[0]}) from cache...")
main = open(f"{CACHE_DIR}{movie[1]}html.html").read()
else:
print(f"Requesting movie releases: {BASE_URL}{movie[1]}")
req = requests.get(f"{BASE_URL}{movie[1]}", headers={'User-Agent': 'Mozilla/5.0'})
main = req.text
if not os.path.exists(f"{CACHE_DIR}{movie[1]}"):
os.makedirs(f"{CACHE_DIR}{movie[1]}")
open(f"{CACHE_DIR}{movie[1]}html.html", "w+").write(main)
soup = BeautifulSoup(main, "html.parser")
for href in soup.find_all("a"):
if href.get("href").startswith("/torrent/"):
links.append((href.get("href"), movie[0]))
except Exception as e:
print(e)
return links
def process_links(links):
print(f"Checking links...({len(links)})")
counter = 1
for link in links:
try:
print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
torrent_html = req.text
t = {}
soup = BeautifulSoup(torrent_html, "html.parser")
t['title'] = soup.find("h1").text.strip()
t['size'] = 0
t['magnets'] = []
t['torrents'] = []
all_a = soup.find_all("a")
for a in all_a:
if a.get("href").startswith("https://www.imdb.com/title"):
t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
if a.get("href").startswith("magnet:"):
t['magnets'].append(a.get("href"))
if a.get("href").startswith(TORRENT_CACHES):
t['torrents'].append(a.get("href"))
all_li = soup.find_all("li")
for li in all_li:
if "Total size" in li.text:
size = li.findChildren("span")[0].text
mb = False
if "MB" in size: mb = True
size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
if mb:
t['size'] = math.trunc(float(size) * 1048576)
else:
t['size'] = math.trunc(float(size) * 1073741824)
t['seeders'] = soup.find("span", {"class": "seeds"}).text
all_p = soup.find_all("p")
for p in all_p:
if "Infohash :" in p.text:
t['infoHash'] = p.findChildren("span")[0].text.lower()
t['files'] = []
file_div = soup.find("div", {"id":"files"})
for li in file_div.findChildren("li"):
f = re.sub('\s\(.*\)', '', li.text)
t["files"].append(f)
t['trackers'] = []
tracker_div = soup.find("div", {"id":"tracker-list"})
for tracker in tracker_div.findChildren("li"):
t['trackers'].append(tracker.text.strip())
if not 'imdbid' in t or t['imdbid'] == '':
found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
if found is not None:
t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
else:
new_id = imdb_find(link[1])
if new_id is not None:
t['imdbid'] = f"tt{new_id}"
else:
print(f"{t['title']} has no IMDB Id")
continue
build_and_write(t)
except:
counter += 1
continue
counter += 1

View File

@@ -1,41 +0,0 @@
import os
import threading
from p1337x import process_links, get_links_initial, get_links_and_process
from apscheduler.schedulers.background import BackgroundScheduler
from shared import read_config
print("Scaper starting...")
if os.path.isfile(".init"):
print("Found init file, executing initial sync. Be patient.")
process_links(get_links_initial())
os.remove(".init")
sched = BackgroundScheduler(timezone="America/New_York")
sched.start()
# 1337x
PROVIDER = "1337x"
pages = read_config(PROVIDER, "urls_to_scrape")
interval = read_config(PROVIDER, "scrape_interval")
for page in pages:
j = sched.add_job(
get_links_and_process,
'interval',
days=interval["days"],
hours=interval["hours"],
minutes=interval["minutes"],
seconds=interval["seconds"],
id=page,
args=[page],
max_instances=1)
print(f"{page} willl be scraped {j.next_run_time}.")
# Wait forever
main_thread = threading.main_thread()
while True:
L = threading.enumerate()
L.remove(main_thread) # or avoid it in the for loop
for t in L:
t.join()

View File

@@ -1,73 +0,0 @@
import sqlite3
import re
import os
import json
from imdb import Cinemagoer
SQLITE_PATH = "/sqlite/torrentio.sqlite"
ia = Cinemagoer()
CONFIG = "config.json"
def read_config(provider, key):
if os.path.isfile(CONFIG):
f = open(CONFIG, "r")
cfg = json.load(f)
return cfg[provider][key]
def filter_file(file):
allowed_ext = ['.mp4', '.mkv', '.avi', '.mpeg', '.mpg', '.mpv', '.mov']
if os.path.splitext(file)[1] in allowed_ext:
return True
return False
def create_connection(db_file):
conn = None
try:
conn = sqlite3.connect(db_file, check_same_thread=False)
except Exception as e:
print(e)
exit(1)
return conn
sqlite = create_connection(SQLITE_PATH)
def build_and_write(torrent):
try:
print(f"Recording {torrent['title']} in the database")
q = f"INSERT OR REPLACE INTO torrents (infoHash, provider, title, size, type, uploadDate, seeders, trackers) VALUES (?,?,?,?,?,?,?,?)"
p = (torrent['infoHash'],'1337x',torrent['title'],torrent['size'],'movie','1/1/2024',torrent['seeders'],','.join(torrent['trackers']))
cursor = sqlite.cursor()
cursor.execute(q,p)
for file in torrent['files']:
if filter_file(file):
q = f"INSERT OR REPLACE INTO files (infoHash, fileIndex, title, size, imdbId) VALUES (?,?,?,?,?)"
p = (torrent['infoHash'], torrent['files'].index(file), file, torrent['size'], torrent['imdbid'])
cursor.execute(q,p)
sqlite.commit()
cursor.close()
except sqlite3.Error as error:
print(error)
def imdb_find(name):
movie = ia.search_movie(name)
if len(movie) >= 1:
return movie[0].movieID
return None
def extract_title(filename):
try:
filename = filename.strip()
filename = filename.replace('.', ' ')
res = re.search('([^\\\]+)\.(avi|mkv|mpeg|mpg|mov|mp4)$', filename)
if res:
filename = res.group(1)
res = re.search('(.*?)(dvdrip|xvid| cd[0-9]|dvdscr|brrip|divx|[\{\(\[]?[0-9]{4}).*', filename)
if res:
filename = res.group(1)
res = re.search('(.*?)\(.*\)(.*)', filename)
if res:
filename = res.group(1)
return filename
except:
return ""

View File

@@ -1,44 +0,0 @@
CREATE TABLE "torrents" (
"infoHash" TEXT,
"provider" TEXT NOT NULL,
"torrentId" TEXT,
"title" TEXT NOT NULL,
"size" INTEGER,
"type" TEXT NOT NULL,
"uploadDate" TEXT NOT NULL,
"seeders" INTEGER,
"trackers" TEXT,
"languages" TEXT,
"resolution" TEXT,
"createdAt" TEXT,
"updatedAt" TEXT,
PRIMARY KEY("infoHash")
);
CREATE TABLE "files" (
"id" INTEGER,
"infoHash" TEXT NOT NULL,
"fileIndex" TEXT,
"title" INTEGER,
"size" INTEGER,
"imdbId" TEXT,
"imdbSeason" INTEGER,
"imdbEpisode" INTEGER,
"kitsuId" INTEGER,
"kitsuEpisode" INTEGER,
"createdAt" TEXT,
"updatedAt" TEXT,
FOREIGN KEY("infoHash") REFERENCES "torrent"("infoHash") ON DELETE CASCADE,
PRIMARY KEY("id" AUTOINCREMENT)
UNIQUE(infoHash, fileIndex)
);
CREATE TABLE "subtitles" (
"infoHash" TEXT NOT NULL,
"fileIndex" INTEGER NOT NULL,
"fileId" INTEGER,
"title" TEXT NOT NULL,
"size" INTEGER NOT NULL,
FOREIGN KEY("fileId") REFERENCES "file"("id") ON DELETE SET NULL
FOREIGN KEY("infoHash") REFERENCES "torrent"("infoHash") ON DELETE CASCADE
);