diff --git a/lib/utils.js b/lib/utils.js index 5142716..b6c8a33 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -175,17 +175,122 @@ function reresolve (node, baseurl) { } exports.reresolve = reresolve; +var HTML_TAGS = new Set([ + 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', + 'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button', + 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', + 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', + 'em', 'embed', + 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', + 'i', 'iframe', 'img', 'input', 'ins', 'isindex', + 'kbd', + 'label', 'legend', 'li', 'link', 'listing', + 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol', + 'nav', 'nextid', 'nobr', 'noembed', 'noframes', 'noscript', + 'object', 'ol', 'optgroup', 'option', 'output', + 'p', 'param', 'picture', 'plaintext', 'pre', 'progress', + 'q', + 'rb', 'rp', 'rt', 'rtc', 'ruby', + 's', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', + 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', + 'thead', 'time', 'title', 'tr', 'track', 'tt', + 'u', 'ul', + 'var', 'video', + 'wbr', + 'xmp' +]); + +/* + * Scan markup starting at str[i] (which must be '<') and return its length + * and type if it is recognized markup, or null if it isn't. + * Recognized types: + * { alwaysStrip: true, len } - comments, doctypes, PIs + * { tagName, len } - opening or closing HTML tags + * + * Respects quoted attribute values so that an attribute like title="1 > 0" + * doesn't cause a premature close. 
/*
 * Scan markup starting at str[i] (which must be '<') and return its length
 * and type if it is recognized markup, or null if it isn't.
 *
 * Recognized types:
 *   { alwaysStrip: true, len }  - comments, <!...> declarations (doctypes), PIs
 *   { tagName, len }            - opening or closing tags; tagName is lowercased
 *
 * `len` counts from str[i] up to and including the terminator, so the caller
 * can skip the markup with `i += len`.
 *
 * Quoted attribute values are respected, so an attribute like title="1 > 0"
 * doesn't cause a premature close. A tag name must be followed by whitespace,
 * '/' or '>'; text such as <a@example.com> or <b-2> merely starts with
 * letters and is reported as "not markup" (null) rather than as tag a / b.
 *
 * @param {string} str
 * @param {number} i - index of the '<' to inspect
 * @returns {Object|null}
 * @private
 */
function readMarkupAt (str, i) {
  // HTML comment: <!-- ... -->
  if (str.slice(i, i + 4) === '<!--') {
    var commentEnd = str.indexOf('-->', i + 4);
    return commentEnd !== -1 ? { alwaysStrip: true, len: commentEnd + 3 - i } : null;
  }

  // Processing instruction: <? ... ?>
  if (str[i + 1] === '?') {
    var piEnd = str.indexOf('?>', i + 2);
    return piEnd !== -1 ? { alwaysStrip: true, len: piEnd + 2 - i } : null;
  }

  // Doctype / other declaration: <! ... >  (ends at the first '>', quotes are not considered)
  if (str[i + 1] === '!') {
    var declEnd = str.indexOf('>', i + 2);
    return declEnd !== -1 ? { alwaysStrip: true, len: declEnd + 1 - i } : null;
  }

  // Closing tag or opening tag: </name ...> or <name ...>
  var isClosing = str[i + 1] === '/';
  var j = isClosing ? i + 2 : i + 1;
  var nameStart = j;
  while (j < str.length) {
    var code = str.charCodeAt(j);
    var isLetter = (code >= 97 && code <= 122) || (code >= 65 && code <= 90);
    var isDigit = code >= 48 && code <= 57;
    // First character must be an ASCII letter; later ones may also be digits (h1..h6)
    if (j === nameStart ? !isLetter : !(isLetter || isDigit)) break;
    j++;
  }
  var tagName = str.slice(nameStart, j).toLowerCase();
  if (!tagName) return null;

  // The name has to end at a real delimiter. Without this check "<a@b.c>" would
  // be read as tag "a" followed by junk attributes and stripped by the caller.
  if (j >= str.length) return null; // ran out of input right after the name
  var delimiter = str[j];
  if (delimiter !== '>' && delimiter !== '/' && ' \t\n\f\r'.indexOf(delimiter) === -1) {
    return null;
  }

  // Scan for >, respecting quoted attribute values
  var quote = null;
  while (j < str.length) {
    var ch = str[j];
    if (quote) {
      if (ch === quote) quote = null;
    } else if (ch === '"' || ch === '\'') {
      quote = ch;
    } else if (ch === '>') {
      return { tagName: tagName, len: j + 1 - i };
    }
    j++;
  }
  return null; // unclosed tag (or unterminated quoted value)
}
* * @param {string} str * @returns {string} * @private */ function stripHtml (str) { - return str.replace(/<.*?>/g, ''); + var out = ''; + var i = 0; + while (i < str.length) { + if (str[i] === '<') { + var markup = readMarkupAt(str, i); + if (markup && (markup.alwaysStrip || HTML_TAGS.has(markup.tagName))) { + i += markup.len; + continue; + } + } + out += str[i]; + i++; + } + return out; } +exports.HTML_TAGS = HTML_TAGS; exports.stripHtml = stripHtml; diff --git a/test/angle-brackets.js b/test/angle-brackets.js new file mode 100644 index 0000000..aa19a8e --- /dev/null +++ b/test/angle-brackets.js @@ -0,0 +1,55 @@ +const { Readable } = require('stream'); +describe('angle brackets in title', function () { + + var feeds = [ + ` + + +W3Schools Home Page +http://www.w3schools.com +Free web building tutorials + +RSS <<<Tutorial>>> +http://www.w3schools.com/xml/xml_rss.asp +New RSS tutorial on W3Schools + + + +`, ` + + +W3Schools Home Page +http://www.w3schools.com +Free web building tutorials + +RSS <<<Tutorial>>> +http://www.w3schools.com/xml/xml_rss.asp +New RSS tutorial on W3Schools + + + +`]; + + feeds.forEach(function (feed) { + it('should be properly decoded', function (done) { + var feedparser = new FeedParser(); + var titles = []; + Readable.from(feed).pipe(feedparser); + feedparser.on('readable', function () { + var item; + while ((item = this.read())) { + titles.push(item.title); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(titles[0], 'RSS <<>>'); + done(); + }); + }); + }); + +}); diff --git a/test/utils.js b/test/utils.js index 19dd2c1..bb7f69f 100644 --- a/test/utils.js +++ b/test/utils.js @@ -307,6 +307,14 @@ describe('utils', function () { describe('stripHtml', function () { + it('returns the string unchanged when there are no tags', function () { + assert.strictEqual(utils.stripHtml('plain text'), 'plain text'); + }); + + it('returns an empty string for an empty string', 
function () { + assert.strictEqual(utils.stripHtml(''), ''); + }); + it('removes simple HTML tags', function () { assert.strictEqual(utils.stripHtml('bold'), 'bold'); }); @@ -323,12 +331,122 @@ describe('utils', function () { assert.strictEqual(utils.stripHtml('link'), 'link'); }); - it('returns the string unchanged when there are no tags', function () { - assert.strictEqual(utils.stripHtml('plain text'), 'plain text'); + it('strips nested tags leaving inner text', function () { + assert.strictEqual(utils.stripHtml('
You MUST remove tags
'), 'You MUST remove tags'); }); - it('returns an empty string for an empty string', function () { - assert.strictEqual(utils.stripHtml(''), ''); + it('strips a self-closing tag without trailing slash', function () { + assert.strictEqual(utils.stripHtml('before
after'), 'beforeafter'); + }); + + it('strips a self-closing tag with trailing slash', function () { + assert.strictEqual(utils.stripHtml('before
after'), 'beforeafter'); + }); + + it('strips a self-closing tag with attributes', function () { + assert.strictEqual(utils.stripHtml('beforexafter'), 'beforeafter'); + }); + + it('strips img with self-closing slash and attributes', function () { + assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter'); + }); + + it('strips closing tags', function () { + assert.strictEqual(utils.stripHtml('text

'), 'text'); + }); + + it('strips tags with multiple attributes', function () { + assert.strictEqual(utils.stripHtml('
content
'), 'content'); + }); + + // --- edge cases: non-HTML angle brackets must be preserved --- + + it('preserves literal angle brackets that are not HTML tags', function () { + assert.strictEqual(utils.stripHtml('1 < 2'), '1 < 2'); + }); + + it('preserves less-than followed by a number', function () { + assert.strictEqual(utils.stripHtml('x<3'), 'x<3'); + }); + + it('preserves triple left angle brackets', function () { + assert.strictEqual(utils.stripHtml('<<<'), '<<<'); + }); + + it('preserves triple right angle brackets', function () { + assert.strictEqual(utils.stripHtml('>>>'), '>>>'); + }); + + it('preserves encoded angle brackets decoded by XML parser', function () { + assert.strictEqual(utils.stripHtml('RSS <<>>'), 'RSS <<>>'); + }); + + it('does not treat unknown tag names as HTML', function () { + assert.strictEqual(utils.stripHtml('a b'), 'a b'); + }); + + it('does not treat a closing tag with unknown name as HTML', function () { + assert.strictEqual(utils.stripHtml(''), ''); + }); + + it('preserves bare angle brackets adjacent to real HTML tags', function () { + assert.strictEqual(utils.stripHtml('<bold>'), ''); + }); + + it('strips real HTML but preserves non-HTML angle brackets in the same string', function () { + assert.strictEqual(utils.stripHtml('if a < b then yes else xz'), 'if a < b then yes else xz'); + }); + + it('strips non-void elements written as self-closing like
', function () { + assert.strictEqual(utils.stripHtml('
text
'), 'text'); + }); + + it('strips multiline tags spanning several lines', function () { + assert.strictEqual( + utils.stripHtml('Company Logo'), + '' + ); + }); + + it('strips HTML comments', function () { + assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter'); + }); + + it('strips multi-line HTML comments', function () { + assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter'); + }); + + it('strips doctype declarations', function () { + assert.strictEqual(utils.stripHtml('Title'), 'Title'); + }); + + it('strips XML processing instructions', function () { + assert.strictEqual(utils.stripHtml('text'), 'text'); + }); + + it('strips xml-stylesheet processing instructions', function () { + assert.strictEqual(utils.stripHtml('text'), 'text'); + }); + + it('strips mixed HTML tags, comments, and processing instructions in one string', function () { + assert.strictEqual( + utils.stripHtml('

Hello world

'), + 'Hello world' + ); + }); + + it('strips tags with double-quoted attribute values containing >', function () { + assert.strictEqual(utils.stripHtml('link'), 'link'); + }); + + it('strips tags with single-quoted attribute values containing >', function () { + assert.strictEqual(utils.stripHtml(' 0\'>link'), 'link'); + }); + + utils.HTML_TAGS.forEach(function (tag) { + it(`strips ${tag} HTML tag opening and closing and self-closing`, function () { + assert.strictEqual(utils.stripHtml('<' + tag + '>content and <' + tag + ' />more'), 'content and more', 'expected <' + tag + '> to be stripped'); + }); }); });