First self-hosted release
137
addon/hosted/p1337x.py
Normal file
@@ -0,0 +1,137 @@
import os
import re
import math
import requests
import time
from imdb import Cinemagoer
from bs4 import BeautifulSoup
from shared import imdb_find, build_and_write, extract_title, read_config

TORRENT_CACHES = ('http://itorrents.org', 'http://torrage.info', 'http://btcache.me')
BASE_URL = 'https://1337x.to'
MOVIE_BASE = 'https://1337x.to/movie-library'
MOVIE_LIBRARY_MAX_PAGE = 301
CACHE_DIR = "./scrape-cache"
PROVIDER = "1337x"
SLEEP_BETWEEN_REQUESTS = read_config(PROVIDER, "sleep")

if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

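The `shared` module is not part of this diff. A minimal sketch of the four imported helpers, inferred only from how they are called in this file, might look like the stubs below; the real implementations in the repo will differ, and the Cinemagoer-based lookup and the config.json layout are assumptions.

# shared.py -- hypothetical stubs matching the call sites in p1337x.py.
import json
import re
from imdb import Cinemagoer

def read_config(provider, key):
    # Assumed layout: a config.json keyed by provider name.
    with open("config.json") as fh:
        return json.load(fh)[provider][key]

def extract_title(text):
    # Assumed: collapse whitespace so release names are searchable titles.
    return re.sub(r'\s+', ' ', text).strip()

def imdb_find(title):
    # Assumed: resolve a title to a numeric IMDb id via Cinemagoer.
    results = Cinemagoer().search_movie(title)
    return results[0].movieID if results else None

def build_and_write(torrent):
    # Assumed: persist the scraped record; a stub just prints it.
    print(torrent)
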
def get_links_and_process(url):
    links = []
    print(f"Requesting movies from: {url}")
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, "html.parser")
    for a in soup.find_all("a"):
        href = a.get("href")
        # Anchors without an href attribute yield None; skip them.
        if href and href.startswith("/torrent/"):
            links.append((href, extract_title(a.text)))
    process_links(links)

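A caller would hand this any 1337x listing page whose torrent links should be scraped; for example (the listing path here is an assumption, not taken from the commit):

get_links_and_process(f"{BASE_URL}/popular-movies")  # hypothetical listing URL
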
def get_links_initial():
    links = []
    for i in range(1, MOVIE_LIBRARY_MAX_PAGE + 1):
        try:
            print(f"Sleeping {SLEEP_BETWEEN_REQUESTS}")
            time.sleep(SLEEP_BETWEEN_REQUESTS)

            main = ""
            if os.path.isfile(f"{CACHE_DIR}/main-{i}.html"):
                print(f"Reading main page({i}) from cache...")
                with open(f"{CACHE_DIR}/main-{i}.html", "r") as fh:
                    main = fh.read()
            else:
                print(f"Requesting main index: {MOVIE_BASE}/{i}/")
                req = requests.get(f"{MOVIE_BASE}/{i}/", headers={'User-Agent': 'Mozilla/5.0'})
                if req.status_code == 404:
                    print(f"Page does not exist: {MOVIE_BASE}/{i}/. Breaking loop.")
                    break
                main = req.text
                with open(f"{CACHE_DIR}/main-{i}.html", "w") as fh:
                    fh.write(main)

            movies = []
            soup = BeautifulSoup(main, "html.parser")
            for h3 in soup.find_all("h3"):
                anchors = h3.findChildren("a", href=True)
                # Some h3 headings carry no link; skip them instead of crashing.
                if not anchors:
                    continue
                movies.append((anchors[0].text, anchors[0].get("href")))

            for movie_title, movie_link in movies:
                # movie_link starts with "/", so these cache paths nest under CACHE_DIR.
                if os.path.isfile(f"{CACHE_DIR}{movie_link}html.html"):
                    print(f"Reading movie page({movie_title}) from cache...")
                    with open(f"{CACHE_DIR}{movie_link}html.html") as fh:
                        main = fh.read()
                else:
                    print(f"Requesting movie releases: {BASE_URL}{movie_link}")
                    req = requests.get(f"{BASE_URL}{movie_link}", headers={'User-Agent': 'Mozilla/5.0'})
                    main = req.text
                    if not os.path.exists(f"{CACHE_DIR}{movie_link}"):
                        os.makedirs(f"{CACHE_DIR}{movie_link}")
                    with open(f"{CACHE_DIR}{movie_link}html.html", "w") as fh:
                        fh.write(main)
                soup = BeautifulSoup(main, "html.parser")
                for a in soup.find_all("a"):
                    href = a.get("href")
                    if href and href.startswith("/torrent/"):
                        links.append((href, movie_title))
        except Exception as e:
            print(e)
    return links

def process_links(links):
    print(f"Checking links...({len(links)})")
    for counter, link in enumerate(links, start=1):
        print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
        req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
        torrent_html = req.text
        t = {}
        soup = BeautifulSoup(torrent_html, "html.parser")
        t['title'] = soup.find("h1").text.strip()
        t['size'] = 0
        t['magnets'] = []
        t['torrents'] = []
        for a in soup.find_all("a"):
            href = a.get("href")
            if not href:
                continue
            if href.startswith("https://www.imdb.com/title"):
                # Drop a trailing slash so the last path segment is the id itself.
                t['imdbid'] = href.rstrip("/").split('/')[-1]
            if href.startswith("magnet:"):
                t['magnets'].append(href)
            if href.startswith(TORRENT_CACHES):
                t['torrents'].append(href)
        for li in soup.find_all("li"):
            if "Total size" in li.text:
                size = li.findChildren("span")[0].text
                mb = "MB" in size
                size = re.sub(r'\s*(GB|MB)', '', size).replace(',', '')
                # 1 MiB = 1048576 bytes, 1 GiB = 1073741824 bytes.
                t['size'] = math.trunc(float(size) * (1048576 if mb else 1073741824))
        seeds = soup.find("span", {"class": "seeds"})
        t['seeders'] = seeds.text if seeds else "0"
        for p in soup.find_all("p"):
            if "Infohash :" in p.text:
                t['infoHash'] = p.findChildren("span")[0].text.lower()
        t['files'] = []
        file_div = soup.find("div", {"id": "files"})
        if file_div:
            for li in file_div.findChildren("li"):
                # Strip the trailing "(size)" annotation from each file name.
                t['files'].append(re.sub(r'\s\(.*\)', '', li.text))
        t['trackers'] = []
        tracker_div = soup.find("div", {"id": "tracker-list"})
        if tracker_div:
            for tracker in tracker_div.findChildren("li"):
                t['trackers'].append(tracker.text.strip())
        if 'imdbid' not in t or t['imdbid'] == '':
            # Fall back to a raw-HTML search, then to an IMDb lookup by title.
            found = re.search(r"https://www\.imdb\.com/title/tt\d+", torrent_html)
            if found is not None:
                t['imdbid'] = found.group(0).split('/')[-1]
            else:
                new_id = imdb_find(link[1])
                if new_id is not None:
                    t['imdbid'] = f"tt{new_id}"
                else:
                    print(f"{t['title']} has no IMDB Id")
                    continue
        build_and_write(t)
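
As committed, the module only defines these functions; nothing in the file invokes them. A hypothetical entry point wiring the full crawl together could look like this (the __main__ guard is an assumption, not part of the commit):

# Hypothetical entry point -- not in the committed file.
if __name__ == "__main__":
    # Walk the movie library once, then resolve every torrent page found.
    process_links(get_links_initial())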