Mirror of https://github.com/knightcrawler-stremio/knightcrawler.git (synced 2024-12-20 03:29:51 +00:00)
Wrapped link processing in a try/except
@@ -9,6 +9,7 @@ services:
       - 'mongodb_data:/bitnami/mongodb'
   scraper:
     build: ./
+    restart: always
     volumes:
       - 'sqlite_data:/sqlite'
   torrentio:
@@ -80,58 +80,62 @@ def process_links(links):
     print(f"Checking links...({len(links)})")
     counter = 1
     for link in links:
-        print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
-        req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
-        torrent_html = req.text
-        t = {}
-        soup = BeautifulSoup(torrent_html, "html.parser")
-        t['title'] = soup.find("h1").text.strip()
-        t['size'] = 0
-        t['magnets'] = []
-        t['torrents'] = []
-        all_a = soup.find_all("a")
-        for a in all_a:
-            if a.get("href").startswith("https://www.imdb.com/title"):
-                t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
-            if a.get("href").startswith("magnet:"):
-                t['magnets'].append(a.get("href"))
-            if a.get("href").startswith(TORRENT_CACHES):
-                t['torrents'].append(a.get("href"))
-        all_li = soup.find_all("li")
-        for li in all_li:
-            if "Total size" in li.text:
-                size = li.findChildren("span")[0].text
-                mb = False
-                if "MB" in size: mb = True
-                size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
-                if mb:
-                    t['size'] = math.trunc(float(size) * 107374182)
-                else:
-                    t['size'] = math.trunc(float(size) * 1073741824)
-        t['seeders'] = soup.find("span", {"class": "seeds"}).text
-        all_p = soup.find_all("p")
-        for p in all_p:
-            if "Infohash :" in p.text:
-                t['infoHash'] = p.findChildren("span")[0].text.lower()
-        t['files'] = []
-        file_div = soup.find("div", {"id":"files"})
-        for li in file_div.findChildren("li"):
-            f = re.sub('\s\(.*\)', '', li.text)
-            t["files"].append(f)
-        t['trackers'] = []
-        tracker_div = soup.find("div", {"id":"tracker-list"})
-        for tracker in tracker_div.findChildren("li"):
-            t['trackers'].append(tracker.text.strip())
-        if not 'imdbid' in t or t['imdbid'] == '':
-            found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
-            if found is not None:
-                t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
-            else:
-                new_id = imdb_find(link[1])
-                if new_id is not None:
-                    t['imdbid'] = f"tt{new_id}"
-                else:
-                    print(f"{t['title']} has no IMDB Id")
-                    continue
-        build_and_write(t)
+        try:
+            print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
+            req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
+            torrent_html = req.text
+            t = {}
+            soup = BeautifulSoup(torrent_html, "html.parser")
+            t['title'] = soup.find("h1").text.strip()
+            t['size'] = 0
+            t['magnets'] = []
+            t['torrents'] = []
+            all_a = soup.find_all("a")
+            for a in all_a:
+                if a.get("href").startswith("https://www.imdb.com/title"):
+                    t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
+                if a.get("href").startswith("magnet:"):
+                    t['magnets'].append(a.get("href"))
+                if a.get("href").startswith(TORRENT_CACHES):
+                    t['torrents'].append(a.get("href"))
+            all_li = soup.find_all("li")
+            for li in all_li:
+                if "Total size" in li.text:
+                    size = li.findChildren("span")[0].text
+                    mb = False
+                    if "MB" in size: mb = True
+                    size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
+                    if mb:
+                        t['size'] = math.trunc(float(size) * 107374182)
+                    else:
+                        t['size'] = math.trunc(float(size) * 1073741824)
+            t['seeders'] = soup.find("span", {"class": "seeds"}).text
+            all_p = soup.find_all("p")
+            for p in all_p:
+                if "Infohash :" in p.text:
+                    t['infoHash'] = p.findChildren("span")[0].text.lower()
+            t['files'] = []
+            file_div = soup.find("div", {"id":"files"})
+            for li in file_div.findChildren("li"):
+                f = re.sub('\s\(.*\)', '', li.text)
+                t["files"].append(f)
+            t['trackers'] = []
+            tracker_div = soup.find("div", {"id":"tracker-list"})
+            for tracker in tracker_div.findChildren("li"):
+                t['trackers'].append(tracker.text.strip())
+            if not 'imdbid' in t or t['imdbid'] == '':
+                found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
+                if found is not None:
+                    t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
+                else:
+                    new_id = imdb_find(link[1])
+                    if new_id is not None:
+                        t['imdbid'] = f"tt{new_id}"
+                    else:
+                        print(f"{t['title']} has no IMDB Id")
+                        continue
+            build_and_write(t)
+        except:
+            counter += 1
+            continue
         counter += 1
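In effect, the whole per-link pipeline (fetch, parse, IMDB lookup, write) now runs inside a try/except, so one failing page no longer aborts the entire run: any exception is caught, the progress counter still advances, and the loop moves on to the next link. Below is a minimal, self-contained sketch of that pattern; fetch_and_parse and the sample links list are hypothetical stand-ins for the real fetching and parsing shown in the diff, and it catches Exception rather than the bare except: the commit uses.

def fetch_and_parse(path):
    # Hypothetical stand-in for the body of the try block in the diff
    # (requests.get + BeautifulSoup parsing); raises on any failure.
    if path is None:
        raise ValueError("broken link")
    return {"title": path}

def process_links(links):
    counter = 1
    for link in links:
        try:
            torrent = fetch_and_parse(link[0])
            print(f"Processed {torrent['title']} ({counter}/{len(links)})")
        except Exception as exc:
            # Skip this link but keep the counter accurate,
            # mirroring the except branch added by the commit.
            print(f"Skipping {link[0]}: {exc}")
            counter += 1
            continue
        counter += 1

process_links([("/torrent/1", "Some Movie"), (None, "Broken entry")])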