diff --git a/lib/utils.js b/lib/utils.js
index 5142716..b6c8a33 100644
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -175,17 +175,122 @@ function reresolve (node, baseurl) {
}
exports.reresolve = reresolve;
+/*
+ * Lowercase element names that count as real HTML tags, one row per initial
+ * letter. Current and obsolete elements are listed side by side.
+ */
+var HTML_TAGS = new Set([
+  'a abbr acronym address applet area article aside audio',
+  'b base basefont bdi bdo big blink blockquote body br button',
+  'canvas caption center cite code col colgroup',
+  'data datalist dd del details dfn dialog dir div dl dt',
+  'em embed',
+  'fieldset figcaption figure font footer form frame frameset',
+  'h1 h2 h3 h4 h5 h6 head header hgroup hr html',
+  'i iframe img input ins isindex',
+  'kbd',
+  'label legend li link listing',
+  'main map mark marquee menu menuitem meta meter multicol',
+  'nav nextid nobr noembed noframes noscript',
+  'object ol optgroup option output',
+  'p param picture plaintext pre progress',
+  'q',
+  'rb rp rt rtc ruby',
+  's samp script section select slot small source spacer span strike strong style sub summary sup',
+  'table tbody td template textarea tfoot th thead time title tr track tt',
+  'u ul',
+  'var video',
+  'wbr',
+  'xmp'
+].join(' ').split(' '));
+
+/*
+ * Scan markup starting at str[i] (which must be '<') and return its length
+ * and type if it is recognized markup, or null if it isn't.
+ * Recognized types:
+ *   { alwaysStrip: true, len } - comments, doctypes, PIs
+ *   { tagName, len }           - opening or closing tags
+ *
+ * `len` counts from the '<' at i through the closing delimiter, inclusive.
+ * `tagName` is lowercased but not checked against any list here; the caller
+ * decides whether it names a real element.
+ *
+ * A tag name starts with an ASCII letter, continues with ASCII letters or
+ * digits, and must be followed by whitespace, '/' or '>'. Text such as
+ * <a@example.com> is therefore not misread as an <a> tag.
+ *
+ * Respects quoted attribute values so that an attribute like title="1 > 0"
+ * doesn't cause a premature close.
+ *
+ * @param {string} str
+ * @param {number} i
+ * @returns {Object|null}
+ * @private
+ */
+function readMarkupAt (str, i) {
+  // HTML comment: <!-- ... -->
+  if (str.slice(i, i + 4) === '<!--') {
+    var commentEnd = str.indexOf('-->', i + 4);
+    return commentEnd !== -1 ? { alwaysStrip: true, len: commentEnd + 3 - i } : null;
+  }
+
+  // Processing instruction: <? ... ?>
+  if (str[i + 1] === '?') {
+    var piEnd = str.indexOf('?>', i + 2);
+    return piEnd !== -1 ? { alwaysStrip: true, len: piEnd + 2 - i } : null;
+  }
+
+  // Doctype / other declaration: <! ... >
+  if (str[i + 1] === '!') {
+    var declEnd = str.indexOf('>', i + 2);
+    return declEnd !== -1 ? { alwaysStrip: true, len: declEnd + 1 - i } : null;
+  }
+
+  // Closing tag or opening tag: </name ...> or <name ...>
+  var isClosing = str[i + 1] === '/';
+  var j = isClosing ? i + 2 : i + 1;
+  var nameStart = j;
+  while (j < str.length) {
+    var code = str.charCodeAt(j);
+    var isLetter = (code >= 97 && code <= 122) || (code >= 65 && code <= 90);
+    var isDigit = code >= 48 && code <= 57;
+    // First character must be a letter; later ones may also be digits (h1).
+    if (j === nameStart ? !isLetter : !(isLetter || isDigit)) break;
+    j++;
+  }
+  var tagName = str.slice(nameStart, j).toLowerCase();
+  if (!tagName) return null;
+
+  // The name has to end at whitespace, '/' or '>'. Anything else means the
+  // token is longer than the letters and digits consumed above (for example
+  // <a@example.com> or <b-foo>), so it is not the tag it starts like.
+  var terminator = str[j];
+  if (terminator === undefined || ' \t\n\f\r/>'.indexOf(terminator) === -1) return null;
+
+  // Scan for >, respecting quoted attribute values
+  var quote = null;
+  while (j < str.length) {
+    var ch = str[j];
+    if (quote) {
+      if (ch === quote) quote = null;
+    } else if (ch === '"' || ch === '\'') {
+      quote = ch;
+    } else if (ch === '>') {
+      return { tagName: tagName, len: j + 1 - i };
+    }
+    j++;
+  }
+  return null; // unclosed tag
+}
+
/*
- * Aggressivly strip HTML tags
- * Pulled out of node-resanitize because it was all that was being used
- * and it's way lighter...
+ * Strip HTML tags, leaving bare text content.
+ * Scans the string for markup - HTML tags, comments, doctypes, and processing
+ * instructions - and removes them. Only tags with known HTML element names
+ * are stripped; unknown angle-bracket content like <<>> is preserved.
*
* @param {string} str
* @returns {string}
* @private
*/
function stripHtml (str) {
- return str.replace(/<.*?>/g, '');
+  // Kept text is gathered as slices between stripped markup and joined once.
+  var kept = [];
+  var textStart = 0;
+  var pos = 0;
+  var total = str.length;
+  while (pos < total) {
+    // Only a '<' can open markup; every other character is plain text.
+    var markup = str[pos] === '<' ? readMarkupAt(str, pos) : null;
+    var strippable = markup && (markup.alwaysStrip || HTML_TAGS.has(markup.tagName));
+    if (strippable) {
+      // Flush the text seen so far, then jump past the whole markup token.
+      kept.push(str.slice(textStart, pos));
+      pos += markup.len;
+      textStart = pos;
+    } else {
+      pos += 1;
+    }
+  }
+  kept.push(str.slice(textStart));
+  return kept.join('');
}
+// Exported so the test suite can iterate over every recognized tag name.
+exports.HTML_TAGS = HTML_TAGS;
exports.stripHtml = stripHtml;
diff --git a/test/angle-brackets.js b/test/angle-brackets.js
new file mode 100644
index 0000000..aa19a8e
--- /dev/null
+++ b/test/angle-brackets.js
@@ -0,0 +1,55 @@
+const { Readable } = require('stream');
+describe('angle brackets in title', function () {
+
+ var feeds = [
+ `
+
+
+W3Schools Home Page
+http://www.w3schools.com
+Free web building tutorials
+
+RSS <<<Tutorial>>>
+http://www.w3schools.com/xml/xml_rss.asp
+New RSS tutorial on W3Schools
+
+
+
+`, `
+
+
+W3Schools Home Page
+http://www.w3schools.com
+Free web building tutorials
+
+RSS <<<Tutorial>>>
+http://www.w3schools.com/xml/xml_rss.asp
+New RSS tutorial on W3Schools
+
+
+
+`];
+
+  // Run the same expectation against every fixture feed.
+  feeds.forEach(function (feed) {
+    it('should be properly decoded', function (done) {
+      var feedparser = new FeedParser();
+      var titles = [];
+      Readable.from(feed).pipe(feedparser);
+      feedparser.on('readable', function () {
+        var item;
+        while ((item = this.read())) {
+          titles.push(item.title);
+        }
+      })
+        // Hand the error straight to mocha. Asserting on it first would throw
+        // inside the emitter callback, and done would never be called.
+        .on('error', done)
+        .on('end', function () {
+          // The angle brackets around "Tutorial" are literal text in the item
+          // title, not markup, so they must survive parsing untouched.
+          assert.equal(titles[0], 'RSS <<<Tutorial>>>');
+          done();
+        });
+    });
+  });
+
+});
diff --git a/test/utils.js b/test/utils.js
index 19dd2c1..bb7f69f 100644
--- a/test/utils.js
+++ b/test/utils.js
@@ -307,6 +307,14 @@ describe('utils', function () {
describe('stripHtml', function () {
+ it('returns the string unchanged when there are no tags', function () {
+ assert.strictEqual(utils.stripHtml('plain text'), 'plain text');
+ });
+
+ it('returns an empty string for an empty string', function () {
+ assert.strictEqual(utils.stripHtml(''), '');
+ });
+
it('removes simple HTML tags', function () {
assert.strictEqual(utils.stripHtml('bold'), 'bold');
});
@@ -323,12 +331,122 @@ describe('utils', function () {
assert.strictEqual(utils.stripHtml('link'), 'link');
});
- it('returns the string unchanged when there are no tags', function () {
- assert.strictEqual(utils.stripHtml('plain text'), 'plain text');
+ it('strips nested tags leaving inner text', function () {
+ assert.strictEqual(utils.stripHtml('
You MUST remove tags
'), 'You MUST remove tags');
});
- it('returns an empty string for an empty string', function () {
- assert.strictEqual(utils.stripHtml(''), '');
+ it('strips a self-closing tag without trailing slash', function () {
+ assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter');
+ });
+
+ it('strips a self-closing tag with trailing slash', function () {
+ assert.strictEqual(utils.stripHtml('before after'), 'beforeafter');
+ });
+
+ it('strips a self-closing tag with attributes', function () {
+ assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter');
+ });
+
+ it('strips img with self-closing slash and attributes', function () {
+ assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter');
+ });
+
+ it('strips closing tags', function () {
+ assert.strictEqual(utils.stripHtml('text
'), 'text');
+ });
+
+ it('strips tags with multiple attributes', function () {
+ assert.strictEqual(utils.stripHtml('
content
'), 'content');
+ });
+
+ // --- edge cases: non-HTML angle brackets must be preserved ---
+
+ it('preserves literal angle brackets that are not HTML tags', function () {
+ assert.strictEqual(utils.stripHtml('1 < 2'), '1 < 2');
+ });
+
+ it('preserves less-than followed by a number', function () {
+ assert.strictEqual(utils.stripHtml('x<3'), 'x<3');
+ });
+
+ it('preserves triple left angle brackets', function () {
+ assert.strictEqual(utils.stripHtml('<<<'), '<<<');
+ });
+
+ it('preserves triple right angle brackets', function () {
+ assert.strictEqual(utils.stripHtml('>>>'), '>>>');
+ });
+
+ it('preserves encoded angle brackets decoded by XML parser', function () {
+ assert.strictEqual(utils.stripHtml('RSS <<>>'), 'RSS <<>>');
+ });
+
+ it('does not treat unknown tag names as HTML', function () {
+ assert.strictEqual(utils.stripHtml('a b'), 'a b');
+ });
+
+ it('does not treat a closing tag with unknown name as HTML', function () {
+ assert.strictEqual(utils.stripHtml(''), '');
+ });
+
+ it('preserves bare angle brackets adjacent to real HTML tags', function () {
+ assert.strictEqual(utils.stripHtml('<bold>'), '');
+ });
+
+ it('strips real HTML but preserves non-HTML angle brackets in the same string', function () {
+ assert.strictEqual(utils.stripHtml('if a < b then yes else xz'), 'if a < b then yes else xz');
+ });
+
+ it('strips non-void elements written as self-closing like ', function () {
+ assert.strictEqual(utils.stripHtml('text'), 'text');
+ });
+
+ it('strips multiline tags spanning several lines', function () {
+ assert.strictEqual(
+ utils.stripHtml(''),
+ ''
+ );
+ });
+
+ it('strips HTML comments', function () {
+ assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter');
+ });
+
+ it('strips multi-line HTML comments', function () {
+ assert.strictEqual(utils.stripHtml('beforeafter'), 'beforeafter');
+ });
+
+ it('strips doctype declarations', function () {
+ assert.strictEqual(utils.stripHtml('Title'), 'Title');
+ });
+
+ it('strips XML processing instructions', function () {
+ assert.strictEqual(utils.stripHtml('text'), 'text');
+ });
+
+ it('strips xml-stylesheet processing instructions', function () {
+ assert.strictEqual(utils.stripHtml('text'), 'text');
+ });
+
+ it('strips mixed HTML tags, comments, and processing instructions in one string', function () {
+ assert.strictEqual(
+ utils.stripHtml('
Hello world
'),
+ 'Hello world'
+ );
+ });
+
+ it('strips tags with double-quoted attribute values containing >', function () {
+ assert.strictEqual(utils.stripHtml('link'), 'link');
+ });
+
+ it('strips tags with single-quoted attribute values containing >', function () {
+ assert.strictEqual(utils.stripHtml(' 0\'>link'), 'link');
+ });
+
+    // One generated case per known tag: an opening tag, its closing tag, and a
+    // self-closing form must all be removed while the text around them stays.
+    utils.HTML_TAGS.forEach(function (tag) {
+      it(`strips ${tag} HTML tag opening and closing and self-closing`, function () {
+        assert.strictEqual(utils.stripHtml('<' + tag + '>content</' + tag + '> and <' + tag + ' />more'), 'content and more', 'expected <' + tag + '> to be stripped');
+      });
+    });
});