Added back original scrapers, integrated with PGSQL

This commit is contained in:
Gabisonfire
2024-01-17 16:20:00 -05:00
parent 364d1545b1
commit cab7f38c66
79 changed files with 65832 additions and 1702 deletions

View File

@@ -1,4 +0,0 @@
*.lock
*/scrape-cache
Dockerfile
docker-compose.yml

View File

@@ -1,12 +0,0 @@
FROM python:3.10.13
COPY . .
RUN pip --no-cache-dir install pipenv && \
apt-get update && apt-get install -y sqlite3 && \
apt-get clean autoclean && \
apt-get autoremove --yes && \
mkdir /sqlite && \
sqlite3 /sqlite/torrentio.sqlite "VACUUM;" && \
cat torrentio.sql | sqlite3 /sqlite/torrentio.sqlite && \
ls -l /sqlite && \
pipenv install && touch .init
CMD [ "pipenv", "run", "python", "-u", "scraper.py" ]

View File

@@ -1,16 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
requests = "*"
beautifulsoup4 = "*"
imdbpy = "*"
apscheduler = "*"
[dev-packages]
[requires]
python_version = "3.10"
python_full_version = "3.10.13"

View File

@@ -1,15 +0,0 @@
{
"1337x": {
"sleep": 10,
"urls_to_scrape": [
"https://1337x.to/trending/w/movies/",
"https://1337x.to/trending/d/movies/"
],
"scrape_interval": {
"days": 0,
"hours" : 1,
"minutes": 0,
"seconds": 0
}
}
}

View File

@@ -2,30 +2,43 @@ version: '2'
services:
mongodb:
restart: always
image: docker.io/bitnami/mongodb:7.0
ports:
- "27017:27017"
volumes:
- 'mongodb_data:/bitnami/mongodb'
scraper:
build: ./
build: ../../scraper/
restart: always
volumes:
- 'sqlite_data:/sqlite'
ports:
- "7001:7001"
environment:
- PORT=7001
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
torrentio:
build: ../
volumes:
- 'sqlite_data:/sqlite'
restart: always
ports:
- "7000:7000"
links:
- "mongodb:mongodb"
environment:
- MONGODB_URI=mongodb://mongodb:27017/torrentio
- DATABASE_URI=sqlite:/sqlite/torrentio.sqlite
- DATABASE_URI=postgres://postgres@postgres:5432/torrentio
postgres:
image: postgres:14-alpine
ports:
- "5432:5432"
volumes:
- pg_data:/var/lib/postgresql/data
- './init:/docker-entrypoint-initdb.d'
environment:
- POSTGRES_HOST_AUTH_METHOD=trust
- POSTGRES_USER=postgres
- POSTGRES_DB=torrentio
volumes:
mongodb_data:
driver: local
sqlite_data:
pg_data:
driver: local
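
Both the scraper and torrentio now point at the same database through DATABASE_URI, a libpq-style URI matching the postgres service defined above. A minimal sketch of how a Python service could consume it, assuming psycopg2 as the driver (the driver choice and the get_connection helper are illustrative, not part of this commit):

import os

import psycopg2  # assumed driver; nothing in this commit pins a specific client library

# Default mirrors the compose file; the real value is injected via the environment.
DATABASE_URI = os.environ.get("DATABASE_URI", "postgres://postgres@postgres:5432/torrentio")

def get_connection():
    """Open a PostgreSQL connection from the libpq-style URI (hypothetical helper)."""
    return psycopg2.connect(DATABASE_URI)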

View File

@@ -0,0 +1,53 @@
CREATE TABLE "torrents" (
"infoHash" varchar(64) PRIMARY KEY,
"provider" varchar(32) NOT NULL,
"torrentId" varchar(128),
"title" varchar(256) NOT NULL,
"size" bigint,
"type" varchar(16) NOT NULL,
"uploadDate" date NOT NULL,
"seeders" integer,
"trackers" varchar(4096),
"languages" varchar(4096),
"resolution" varchar(16),
"createdAt" date,
"updatedAt" date,
"reviewed" boolean,
"opened" boolean
);
CREATE TABLE "files" (
"id" SERIAL PRIMARY KEY,
"infoHash" varchar(64) NOT NULL,
"fileIndex" integer,
"title" varchar(256) NOT NULL,
"size" bigint,
"imdbId" varchar(32),
"imdbSeason" integer,
"imdbEpisode" integer,
"kitsuId" integer,
"kitsuEpisode" integer,
"createdAt" date,
"updatedAt" date,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
UNIQUE("infoHash","fileIndex")
);
CREATE TABLE "subtitles" (
"id" SERIAL PRIMARY KEY,
"infoHash" varchar(64) NOT NULL,
"fileIndex" integer NOT NULL,
"fileId" bigint,
"title" varchar(512) NOT NULL,
"size" bigint,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE,
FOREIGN KEY("fileId") REFERENCES files("id") ON DELETE SET NULL
);
CREATE TABLE "contents" (
"infoHash" varchar(64),
"fileIndex" integer,
"path" varchar(256),
"size" bigint,
FOREIGN KEY("infoHash") REFERENCES torrents("infoHash") ON DELETE CASCADE
);
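
For illustration, a sketch of how a scraper could write one result into this schema, using ON CONFLICT upserts as the PostgreSQL counterpart of the SQLite INSERT OR REPLACE in the deleted scraper code further down; psycopg2 and the upsert_torrent helper are assumptions, not code shipped by this commit.

from datetime import date

import psycopg2  # assumed driver, as in the sketch above

def upsert_torrent(conn, torrent):
    """Hypothetical helper: store one scraped torrent and its files.

    Field names mirror the dict built by the 1337x scraper below; the upload
    date is filled with today's date because the scraper does not collect it."""
    with conn.cursor() as cur:
        cur.execute(
            'INSERT INTO torrents ("infoHash", provider, title, size, type,'
            ' "uploadDate", seeders, trackers)'
            " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
            ' ON CONFLICT ("infoHash") DO UPDATE SET'
            " seeders = EXCLUDED.seeders, trackers = EXCLUDED.trackers",
            (
                torrent["infoHash"], "1337x", torrent["title"], torrent["size"],
                "movie", date.today(), int(torrent["seeders"]),
                ",".join(torrent["trackers"]),
            ),
        )
        for index, name in enumerate(torrent["files"]):
            cur.execute(
                'INSERT INTO files ("infoHash", "fileIndex", title, size, "imdbId")'
                " VALUES (%s, %s, %s, %s, %s)"
                ' ON CONFLICT ("infoHash", "fileIndex") DO NOTHING',
                (torrent["infoHash"], index, name, torrent["size"], torrent["imdbid"]),
            )
    conn.commit()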

View File

@@ -1,141 +0,0 @@
import os
import re
import math
import requests
import time
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from shared import imdb_find, build_and_write, extract_title, read_config
TORRENT_CACHES = ('http://itorrents.org', 'http://torrage.info', 'http://btcache.me')
BASE_URL = 'https://1337x.to'
MOVIE_BASE = 'https://1337x.to/movie-library'
MOVIE_LIBRARY_MAX_PAGE = 301
CACHE_DIR = "./scrape-cache"
PROVIDER = "1337x"
SLEEP_BETWEEN_REQUESTS = read_config(PROVIDER, "sleep")
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
def get_links_and_process(url):
links = []
print(f"Requesting movies from: {url}")
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
main = req.text
soup = BeautifulSoup(main, "html.parser")
for a in soup.find_all("a"):
if a.get("href").startswith("/torrent/"):
links.append((a.get("href"), extract_title(a.text)))
process_links(links)
def get_links_initial():
links = []
for i in range(1,MOVIE_LIBRARY_MAX_PAGE + 1):
try:
print(f"Sleeping {SLEEP_BETWEEN_REQUESTS}")
time.sleep(SLEEP_BETWEEN_REQUESTS)
main = ""
if os.path.isfile(f"{CACHE_DIR}/main-{i}.html"):
print(f"Reading main page({i}) from cache...")
main = open(f"{CACHE_DIR}/main-{i}.html", "r").read()
else:
print(f"Requesting main index: {MOVIE_BASE}/{i}/")
req = requests.get(f"{MOVIE_BASE}/{i}/", headers={'User-Agent': 'Mozilla/5.0'})
if req.status_code == 404:
print(f"Page does not exist: {MOVIE_BASE}/{i}/. Breaking loop.")
break
main = req.text
open(f"{CACHE_DIR}/main-{i}.html", "w+").write(main)
movies = []
soup = BeautifulSoup(main, "html.parser")
for h3 in soup.find_all("h3"):
a = h3.findChildren("a", href=True)[0]
movie_link = a.get("href")
movie_title = a.text
movies.append((movie_title, movie_link))
for movie in movies:
if os.path.isfile(f"{CACHE_DIR}{movie[1]}html.html"):
print(f"Reading movie page({movie[0]}) from cache...")
main = open(f"{CACHE_DIR}{movie[1]}html.html").read()
else:
print(f"Requesting movie releases: {BASE_URL}{movie[1]}")
req = requests.get(f"{BASE_URL}{movie[1]}", headers={'User-Agent': 'Mozilla/5.0'})
main = req.text
if not os.path.exists(f"{CACHE_DIR}{movie[1]}"):
os.makedirs(f"{CACHE_DIR}{movie[1]}")
open(f"{CACHE_DIR}{movie[1]}html.html", "w+").write(main)
soup = BeautifulSoup(main, "html.parser")
for href in soup.find_all("a"):
if href.get("href").startswith("/torrent/"):
links.append((href.get("href"), movie[0]))
except Exception as e:
print(e)
return links
def process_links(links):
print(f"Checking links...({len(links)})")
counter = 1
for link in links:
try:
print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
torrent_html = req.text
t = {}
soup = BeautifulSoup(torrent_html, "html.parser")
t['title'] = soup.find("h1").text.strip()
t['size'] = 0
t['magnets'] = []
t['torrents'] = []
all_a = soup.find_all("a")
for a in all_a:
if a.get("href").startswith("https://www.imdb.com/title"):
t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
if a.get("href").startswith("magnet:"):
t['magnets'].append(a.get("href"))
if a.get("href").startswith(TORRENT_CACHES):
t['torrents'].append(a.get("href"))
all_li = soup.find_all("li")
for li in all_li:
if "Total size" in li.text:
size = li.findChildren("span")[0].text
mb = False
if "MB" in size: mb = True
size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
if mb:
t['size'] = math.trunc(float(size) * 1048576)
else:
t['size'] = math.trunc(float(size) * 1073741824)
t['seeders'] = soup.find("span", {"class": "seeds"}).text
all_p = soup.find_all("p")
for p in all_p:
if "Infohash :" in p.text:
t['infoHash'] = p.findChildren("span")[0].text.lower()
t['files'] = []
file_div = soup.find("div", {"id":"files"})
for li in file_div.findChildren("li"):
f = re.sub('\s\(.*\)', '', li.text)
t["files"].append(f)
t['trackers'] = []
tracker_div = soup.find("div", {"id":"tracker-list"})
for tracker in tracker_div.findChildren("li"):
t['trackers'].append(tracker.text.strip())
if not 'imdbid' in t or t['imdbid'] == '':
found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
if found is not None:
t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
else:
new_id = imdb_find(link[1])
if new_id is not None:
t['imdbid'] = f"tt{new_id}"
else:
print(f"{t['title']} has no IMDB Id")
continue
build_and_write(t)
except:
counter += 1
continue
counter += 1

View File

@@ -1,41 +0,0 @@
import os
import threading
from p1337x import process_links, get_links_initial, get_links_and_process
from apscheduler.schedulers.background import BackgroundScheduler
from shared import read_config
print("Scaper starting...")
if os.path.isfile(".init"):
print("Found init file, executing initial sync. Be patient.")
process_links(get_links_initial())
os.remove(".init")
sched = BackgroundScheduler(timezone="America/New_York")
sched.start()
# 1337x
PROVIDER = "1337x"
pages = read_config(PROVIDER, "urls_to_scrape")
interval = read_config(PROVIDER, "scrape_interval")
for page in pages:
j = sched.add_job(
get_links_and_process,
'interval',
days=interval["days"],
hours=interval["hours"],
minutes=interval["minutes"],
seconds=interval["seconds"],
id=page,
args=[page],
max_instances=1)
print(f"{page} willl be scraped {j.next_run_time}.")
# Wait forever
main_thread = threading.main_thread()
while True:
L = threading.enumerate()
L.remove(main_thread) # or avoid it in the for loop
for t in L:
t.join()

View File

@@ -1,73 +0,0 @@
import sqlite3
import re
import os
import json
from imdb import Cinemagoer
SQLITE_PATH = "/sqlite/torrentio.sqlite"
ia = Cinemagoer()
CONFIG = "config.json"
def read_config(provider, key):
if os.path.isfile(CONFIG):
f = open(CONFIG, "r")
cfg = json.load(f)
return cfg[provider][key]
def filter_file(file):
allowed_ext = ['.mp4', '.mkv', '.avi', '.mpeg', '.mpg', '.mpv', '.mov']
if os.path.splitext(file)[1] in allowed_ext:
return True
return False
def create_connection(db_file):
conn = None
try:
conn = sqlite3.connect(db_file, check_same_thread=False)
except Exception as e:
print(e)
exit(1)
return conn
sqlite = create_connection(SQLITE_PATH)
def build_and_write(torrent):
try:
print(f"Recording {torrent['title']} in the database")
q = f"INSERT OR REPLACE INTO torrents (infoHash, provider, title, size, type, uploadDate, seeders, trackers) VALUES (?,?,?,?,?,?,?,?)"
p = (torrent['infoHash'],'1337x',torrent['title'],torrent['size'],'movie','1/1/2024',torrent['seeders'],','.join(torrent['trackers']))
cursor = sqlite.cursor()
cursor.execute(q,p)
for file in torrent['files']:
if filter_file(file):
q = f"INSERT OR REPLACE INTO files (infoHash, fileIndex, title, size, imdbId) VALUES (?,?,?,?,?)"
p = (torrent['infoHash'], torrent['files'].index(file), file, torrent['size'], torrent['imdbid'])
cursor.execute(q,p)
sqlite.commit()
cursor.close()
except sqlite3.Error as error:
print(error)
def imdb_find(name):
movie = ia.search_movie(name)
if len(movie) >= 1:
return movie[0].movieID
return None
def extract_title(filename):
try:
filename = filename.strip()
filename = filename.replace('.', ' ')
res = re.search('([^\\\]+)\.(avi|mkv|mpeg|mpg|mov|mp4)$', filename)
if res:
filename = res.group(1)
res = re.search('(.*?)(dvdrip|xvid| cd[0-9]|dvdscr|brrip|divx|[\{\(\[]?[0-9]{4}).*', filename)
if res:
filename = res.group(1)
res = re.search('(.*?)\(.*\)(.*)', filename)
if res:
filename = res.group(1)
return filename
except:
return ""

View File

@@ -1,44 +0,0 @@
CREATE TABLE "torrents" (
"infoHash" TEXT,
"provider" TEXT NOT NULL,
"torrentId" TEXT,
"title" TEXT NOT NULL,
"size" INTEGER,
"type" TEXT NOT NULL,
"uploadDate" TEXT NOT NULL,
"seeders" INTEGER,
"trackers" TEXT,
"languages" TEXT,
"resolution" TEXT,
"createdAt" TEXT,
"updatedAt" TEXT,
PRIMARY KEY("infoHash")
);
CREATE TABLE "files" (
"id" INTEGER,
"infoHash" TEXT NOT NULL,
"fileIndex" TEXT,
"title" INTEGER,
"size" INTEGER,
"imdbId" TEXT,
"imdbSeason" INTEGER,
"imdbEpisode" INTEGER,
"kitsuId" INTEGER,
"kitsuEpisode" INTEGER,
"createdAt" TEXT,
"updatedAt" TEXT,
FOREIGN KEY("infoHash") REFERENCES "torrent"("infoHash") ON DELETE CASCADE,
PRIMARY KEY("id" AUTOINCREMENT)
UNIQUE(infoHash, fileIndex)
);
CREATE TABLE "subtitles" (
"infoHash" TEXT NOT NULL,
"fileIndex" INTEGER NOT NULL,
"fileId" INTEGER,
"title" TEXT NOT NULL,
"size" INTEGER NOT NULL,
FOREIGN KEY("fileId") REFERENCES "file"("id") ON DELETE SET NULL
FOREIGN KEY("infoHash") REFERENCES "torrent"("infoHash") ON DELETE CASCADE
);