From 3ca4479d8a9e03fe6578de7f9549f955435d510a Mon Sep 17 00:00:00 2001 From: TheBeastLT Date: Sat, 14 Mar 2020 22:54:44 +0100 Subject: [PATCH] [scraper] add html entities escape --- scraper/lib/metadata.js | 10 +++++++++- scraper/scrapers/1337x/1337x_api.js | 3 ++- .../scrapers/thepiratebay/thepiratebay_dump_scraper.js | 6 +++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/scraper/lib/metadata.js b/scraper/lib/metadata.js index 43042d6..f06177f 100644 --- a/scraper/lib/metadata.js +++ b/scraper/lib/metadata.js @@ -80,6 +80,14 @@ function escapeTitle(title) { .trim(); } +function escapeHTML(title) { + return title + .replace(/&apos;|&#39;/g, '\'') + .replace(/&amp;|&#38;/g, '&') + .replace(/&#33;/g, '!') + .replace(/&#37;/g, '%'); +} + async function getImdbId(info, type) { const name = escapeTitle(info.title); const year = info.year || info.date && info.date.slice(0, 4); @@ -121,4 +129,4 @@ async function getKitsuId(info) { })); } -module.exports = { getMetadata, getImdbId, getKitsuId }; +module.exports = { getMetadata, getImdbId, getKitsuId, escapeHTML }; diff --git a/scraper/scrapers/1337x/1337x_api.js b/scraper/scrapers/1337x/1337x_api.js index 61a544d..d58bb5b 100644 --- a/scraper/scrapers/1337x/1337x_api.js +++ b/scraper/scrapers/1337x/1337x_api.js @@ -3,6 +3,7 @@ const needle = require('needle'); const Sugar = require('sugar-date'); const decode = require('magnet-uri'); const Promises = require('../../lib/promises'); +const { escapeHTML } = require('../../lib/metadata'); const defaultProxies = [ 'https://1337x.to' @@ -117,7 +118,7 @@ function parseTorrentPage(body) { const imdbIdMatch = details.find('div[id=\'description\']').html().match(/imdb\.com\/title\/(tt\d+)/i); const torrent = { - name: decode(magnetLink).name.replace(/\+/g, ' '), + name: escapeHTML(decode(magnetLink).name.replace(/\+/g, ' ')), infoHash: decode(magnetLink).infoHash, magnetLink: magnetLink, seeders: parseInt(details.find('strong:contains(\'Seeders\')').next().text(), 10), diff --git 
a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js index 67eddd2..e72645d 100644 --- a/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js +++ b/scraper/scrapers/thepiratebay/thepiratebay_dump_scraper.js @@ -8,6 +8,7 @@ const thepiratebay = require('./thepiratebay_api.js'); const bing = require('nodejs-bing'); const { Type } = require('../../lib/types'); const repository = require('../../lib/repository'); +const { escapeHTML } = require('../../lib/metadata'); const { createTorrentEntry, createSkipTorrentEntry, getStoredTorrentEntry } = require('../../lib/torrentEntries'); const NAME = 'ThePirateBay'; @@ -42,10 +43,9 @@ async function scrape() { const torrent = { uploadDate: moment(row[0], 'YYYY-MMM-DD HH:mm:ss').toDate(), infoHash: Buffer.from(row[1], 'base64').toString('hex'), - title: row[2] + title: escapeHTML(row[2]) .replace(/^"|"$/g, '') - .replace(/&amp;/g, '&') - .replace(/&\w{2,6};/g, ' ') + .replace(/&#?\w{2,6};/g, ' ') .replace(/\s+/g, ' ') .trim(), size: parseInt(row[3], 10)