Fix Comando API scraping for pages with different formatting

This commit is contained in:
TheBeastLT
2021-09-15 10:01:50 +02:00
committed by TheBeastLT
parent 0bda72e19c
commit 211003a1dc
3 changed files with 10 additions and 9 deletions

View File

@@ -5,7 +5,7 @@ const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtLanguages } = require('../scraperHelper')
const { isPtDubbed, sanitizePtName, sanitizePtLanguages, sanitizePtOriginalName } = require('../scraperHelper')
const defaultTimeout = 30000;
const maxSearchPage = 50
@@ -111,22 +111,24 @@ function parseTorrentPage(body) {
.map((i, elem) => $(elem).nextUntil('h2, hr'))
.map((i, elem) => $(elem).find('a[href^="magnet"]'))
.map((i, section) => $(section).attr("href")).get();
const details = $('b:contains(\'Original\'), strong:contains(\'Original\')').parent()
const details = $('b:contains(\'Servidor\'), b:contains(\'Original\')').parent()
const imdbIdMatch = details.find('a[href*="imdb.com"]').attr('href')
const torrents = magnets.map(magnetLink => {
const originalName = details.find('strong:contains(\'Original\')').next().text().trim() ||
details.find('b:contains(\'Original\'), strong:contains(\'Original\')')[0].nextSibling.nodeValue;
const originalNameElem = details.find('strong, b')
.filter((i, elem) => $(elem).text().match(/Baixar|Orig(?:\.|inal)/));
const languagesElem = details.find('strong, b')
.filter((i, elem) => $(elem).text().match(/^\s*(Idioma|[AÁ]udio)/));
const originalName = originalNameElem.next().text().trim() || originalNameElem[0].nextSibling.nodeValue;
return {
title: sanitizePtName(escapeHTML(decode(magnetLink).name.replace(/\+/g, ' '))),
originalName: originalName.replace(/: ?/, '').trim(),
originalName: sanitizePtOriginalName(originalName.replace(/: ?/, '')),
year: details.find('a[href*="comando.to/category/"]').text(),
infoHash: decode(magnetLink).infoHash,
magnetLink: magnetLink,
category: parseCategory($('div.entry-categories').html()),
uploadDate: new Date(moment($('a.updated').text(), 'LL', 'pt-br').format()),
imdbId: imdbIdMatch ? imdbIdMatch.split('/')[4] : null,
languages: sanitizePtLanguages(details.find(
'b:contains(\'Idioma\'), b:contains(\'Audio\'), b:contains(\'Áudio\')')[0].nextSibling.nodeValue)
languages: sanitizePtLanguages(languagesElem[0].nextSibling.nodeValue)
}
});
resolve(torrents.filter((x) => x));

View File

@@ -1,7 +1,6 @@
const needle = require("needle")
const cheerio = require("cheerio");
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const { getRandomUserAgent } = require('../../lib/requestHelper');
const { isPtDubbed, sanitizePtName, sanitizePtLanguages } = require('../scraperHelper')

View File

@@ -11,7 +11,7 @@ function sanitizePtName(name) {
}
function sanitizePtOriginalName(name) {
return name.trim().replace(/S\d+$/, '');
return name.trim().replace(/S\d+$|\d.\s?[Tt]emporada/, '');
}
function sanitizePtLanguages(languages) {