[scraper] add html entities escape

This commit is contained in:
TheBeastLT
2020-03-14 22:54:44 +01:00
parent dbb3794c5f
commit 3ca4479d8a
3 changed files with 14 additions and 5 deletions

View File

@@ -80,6 +80,14 @@ function escapeTitle(title) {
.trim();
}
/**
 * Decodes a small, fixed set of HTML character references found in scraped
 * torrent titles back into their literal characters.
 *
 * Handled characters: apostrophe ('), ampersand (&), exclamation mark (!) and
 * percent sign (%), in decimal numeric form (e.g. `&#39;`, also with leading
 * zeros such as `&#039;`) and, for apostrophe and ampersand, in named form
 * (`&apos;`, `&amp;`). Every other entity is left untouched so callers can
 * still post-process it.
 *
 * NOTE(review): the entity spellings were reconstructed — the rendered source
 * showed already-decoded characters in the patterns, which made every replace
 * a no-op. Confirm the list against the upstream file.
 *
 * @param {string} title - raw title that may contain HTML entities.
 * @returns {string} the title with the supported entities decoded.
 */
function escapeHTML(title) {
  // Decimal code point -> literal character.
  const numericEntities = { 33: '!', 37: '%', 38: '&', 39: '\'' };
  // Entity name -> literal character.
  const namedEntities = { amp: '&', apos: '\'' };

  // A single pass guarantees each entity is decoded exactly once: the '&'
  // produced from '&amp;' is never re-read as the start of another entity
  // (e.g. '&amp;#33;' becomes '&#33;', not '!').
  return title.replace(
    /&(?:#0*(33|37|38|39)|(amp|apos));/g,
    (match, code, name) => (code ? numericEntities[code] : namedEntities[name])
  );
}
async function getImdbId(info, type) {
const name = escapeTitle(info.title);
const year = info.year || info.date && info.date.slice(0, 4);
@@ -121,4 +129,4 @@ async function getKitsuId(info) {
}));
}
module.exports = { getMetadata, getImdbId, getKitsuId };
module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML };

View File

@@ -3,6 +3,7 @@ const needle = require('needle');
const Sugar = require('sugar-date');
const decode = require('magnet-uri');
const Promises = require('../../lib/promises');
const { escapeHTML } = require('../../lib/metadata');
const defaultProxies = [
'https://1337x.to'
@@ -117,7 +118,7 @@ function parseTorrentPage(body) {
const imdbIdMatch = details.find('div[id=\'description\']').html().match(/imdb\.com\/title\/(tt\d+)/i);
const torrent = {
name: decode(magnetLink).name.replace(/\+/g, ' '),
name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')),
infoHash: decode(magnetLink).infoHash,
magnetLink: magnetLink,
seeders: parseInt(details.find('strong:contains(\'Seeders\')').next().text(), 10),

View File

@@ -8,6 +8,7 @@ const thepiratebay = require('./thepiratebay_api.js');
const bing = require('nodejs-bing');
const { Type } = require('../../lib/types');
const repository = require('../../lib/repository');
const { escapeHTML } = require('../../lib/metadata');
const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries');
const NAME = 'ThePirateBay';
@@ -42,10 +43,9 @@ async function scrape() {
const torrent = {
uploadDate: moment(row[0], 'YYYY-MMM-DD HH:mm:ss').toDate(),
infoHash: Buffer.from(row[1], 'base64').toString('hex'),
title: row[2]
title: escapeHTML(row[2])
.replace(/^"|"$/g, '')
.replace(/&/g, '&')
.replace(/&\w{2,6};/g, ' ')
.replace(/&#?\w{2,6};/g, ' ')
.replace(/\s+/g, ' ')
.trim(),
size: parseInt(row[3], 10)