Wrapped link processing in try/except

Gabisonfire
2024-01-17 09:37:52 -05:00
parent a34e40fbc8
commit 364d1545b1
2 changed files with 58 additions and 53 deletions


@@ -9,6 +9,7 @@ services:
       - 'mongodb_data:/bitnami/mongodb'
   scraper:
     build: ./
+    restart: always
     volumes:
       - 'sqlite_data:/sqlite'
   torrentio:
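
For context, restart: always tells Docker Compose to restart the scraper container whenever it stops, whether it exits cleanly or crashes ('unless-stopped' would behave the same except after a manual stop). A minimal sketch of the resulting service definition, assuming only the context visible in the hunk above:

services:
  scraper:
    build: ./
    restart: always            # restart the container on any exit, crash or clean
    volumes:
      - 'sqlite_data:/sqlite'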


@@ -80,58 +80,62 @@ def process_links(links):
     print(f"Checking links...({len(links)})")
     counter = 1
     for link in links:
-        print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
-        req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
-        torrent_html = req.text
-        t = {}
-        soup = BeautifulSoup(torrent_html, "html.parser")
-        t['title'] = soup.find("h1").text.strip()
-        t['size'] = 0
-        t['magnets'] = []
-        t['torrents'] = []
-        all_a = soup.find_all("a")
-        for a in all_a:
-            if a.get("href").startswith("https://www.imdb.com/title"):
-                t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
-            if a.get("href").startswith("magnet:"):
-                t['magnets'].append(a.get("href"))
-            if a.get("href").startswith(TORRENT_CACHES):
-                t['torrents'].append(a.get("href"))
-        all_li = soup.find_all("li")
-        for li in all_li:
-            if "Total size" in li.text:
-                size = li.findChildren("span")[0].text
-                mb = False
-                if "MB" in size: mb = True
-                size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
-                if mb:
-                    t['size'] = math.trunc(float(size) * 107374182)
-                else:
-                    t['size'] = math.trunc(float(size) * 1073741824)
-        t['seeders'] = soup.find("span", {"class": "seeds"}).text
-        all_p = soup.find_all("p")
-        for p in all_p:
-            if "Infohash :" in p.text:
-                t['infoHash'] = p.findChildren("span")[0].text.lower()
-        t['files'] = []
-        file_div = soup.find("div", {"id":"files"})
-        for li in file_div.findChildren("li"):
-            f = re.sub('\s\(.*\)', '', li.text)
-            t["files"].append(f)
-        t['trackers'] = []
-        tracker_div = soup.find("div", {"id":"tracker-list"})
-        for tracker in tracker_div.findChildren("li"):
-            t['trackers'].append(tracker.text.strip())
-        if not 'imdbid' in t or t['imdbid'] == '':
-            found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
-            if found is not None:
-                t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
-            else:
-                new_id = imdb_find(link[1])
-                if new_id is not None:
-                    t['imdbid'] = f"tt{new_id}"
-                else:
-                    print(f"{t['title']} has no IMDB Id")
-                    continue
-        build_and_write(t)
+        try:
+            print(f"Processing: {BASE_URL}{link[0]} {counter}/{len(links)}")
+            req = requests.get(f"{BASE_URL}{link[0]}", headers={'User-Agent': 'Mozilla/5.0'})
+            torrent_html = req.text
+            t = {}
+            soup = BeautifulSoup(torrent_html, "html.parser")
+            t['title'] = soup.find("h1").text.strip()
+            t['size'] = 0
+            t['magnets'] = []
+            t['torrents'] = []
+            all_a = soup.find_all("a")
+            for a in all_a:
+                if a.get("href").startswith("https://www.imdb.com/title"):
+                    t['imdbid'] = a.get("href").rstrip("\\").split('/')[-1]
+                if a.get("href").startswith("magnet:"):
+                    t['magnets'].append(a.get("href"))
+                if a.get("href").startswith(TORRENT_CACHES):
+                    t['torrents'].append(a.get("href"))
+            all_li = soup.find_all("li")
+            for li in all_li:
+                if "Total size" in li.text:
+                    size = li.findChildren("span")[0].text
+                    mb = False
+                    if "MB" in size: mb = True
+                    size = re.sub('\s(GB|MB)', '', size).split('.')[0].replace(',','')
+                    if mb:
+                        t['size'] = math.trunc(float(size) * 107374182)
+                    else:
+                        t['size'] = math.trunc(float(size) * 1073741824)
+            t['seeders'] = soup.find("span", {"class": "seeds"}).text
+            all_p = soup.find_all("p")
+            for p in all_p:
+                if "Infohash :" in p.text:
+                    t['infoHash'] = p.findChildren("span")[0].text.lower()
+            t['files'] = []
+            file_div = soup.find("div", {"id":"files"})
+            for li in file_div.findChildren("li"):
+                f = re.sub('\s\(.*\)', '', li.text)
+                t["files"].append(f)
+            t['trackers'] = []
+            tracker_div = soup.find("div", {"id":"tracker-list"})
+            for tracker in tracker_div.findChildren("li"):
+                t['trackers'].append(tracker.text.strip())
+            if not 'imdbid' in t or t['imdbid'] == '':
+                found = re.search("https:\/\/www\.imdb\.com\/title\/tt\d+", torrent_html)
+                if found is not None:
+                    t['imdbid'] = found.group(0).rstrip("\\").split('/')[-1]
+                else:
+                    new_id = imdb_find(link[1])
+                    if new_id is not None:
+                        t['imdbid'] = f"tt{new_id}"
+                    else:
+                        print(f"{t['title']} has no IMDB Id")
+                        continue
+            build_and_write(t)
+        except:
+            counter += 1
+            continue
         counter += 1
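
The Python change is purely structural: the whole per-link body moves inside a try, so one bad link advances the counter and skips ahead instead of killing the run. A minimal sketch of the pattern, with process_one as a hypothetical stand-in for the fetch/parse/build_and_write body (the commit itself uses a bare except:, which also catches KeyboardInterrupt; except Exception is shown here as a narrower alternative):

def process_one(link):
    # Hypothetical stand-in for the scraping body; raises on any per-link failure.
    raise NotImplementedError

def process_links(links):
    counter = 1
    for link in links:
        try:
            process_one(link)
        except Exception as e:  # the commit's bare `except:` would also swallow KeyboardInterrupt
            print(f"Failed to process {link}: {e}")
            counter += 1        # mirror the commit: count the failed link, then move on
            continue
        counter += 1

Using enumerate(links, start=1) with a single increment in a finally block would avoid the duplicated counter += 1, but the sketch above mirrors the commit's control flow.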