Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 109 additions & 4 deletions lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -175,17 +175,122 @@ function reresolve (node, baseurl) {
}
exports.reresolve = reresolve;

/*
 * Lower-case element names that stripHtml accepts as real HTML tags.
 * Grouped by first letter; the list mixes current elements with obsolete
 * ones (blink, marquee, xmp, ...). Lookups are done with the lower-cased
 * name produced by readMarkupAt, so every entry here must be lower-case.
 */
var HTML_TAGS = new Set([].concat(
  ['a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio'],
  ['b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button'],
  ['canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup'],
  ['data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt'],
  ['em', 'embed'],
  ['fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset'],
  ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html'],
  ['i', 'iframe', 'img', 'input', 'ins', 'isindex'],
  ['kbd'],
  ['label', 'legend', 'li', 'link', 'listing'],
  ['main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol'],
  ['nav', 'nextid', 'nobr', 'noembed', 'noframes', 'noscript'],
  ['object', 'ol', 'optgroup', 'option', 'output'],
  ['p', 'param', 'picture', 'plaintext', 'pre', 'progress'],
  ['q'],
  ['rb', 'rp', 'rt', 'rtc', 'ruby'],
  ['s', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup'],
  ['table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th'],
  ['thead', 'time', 'title', 'tr', 'track', 'tt'],
  ['u', 'ul'],
  ['var', 'video'],
  ['wbr'],
  ['xmp']
));

/*
 * Scan markup starting at str[i] (which must be '<') and return its length
 * and type if it is recognized markup, or null if it isn't.
 * Recognized types:
 *   { alwaysStrip: true, len } - comments, doctypes, PIs
 *   { tagName, len }           - opening or closing HTML tags
 *
 * `len` counts from the '<' at str[i] through the terminating '>' inclusive.
 * `tagName` is lower-cased. It is not checked against any list of known
 * element names here; that decision is left to the caller.
 *
 * A tag name (a letter followed by letters/digits) must be immediately
 * followed by whitespace, '/' or '>'. Tokens such as <a@example.com> or
 * <b-day> are therefore not reported as the tags 'a' / 'b'.
 *
 * Respects quoted attribute values so that an attribute like title="1 > 0"
 * doesn't cause a premature close.
 *
 * @param {string} str
 * @param {number} i - index of the '<' to examine
 * @returns {Object|null} descriptor as above, or null for unrecognized or
 *   unterminated markup
 * @private
 */
function readMarkupAt (str, i) {
  // HTML comment: <!-- ... -->
  // An unterminated comment is not markup; it must not fall through to the
  // generic "<!" branch below.
  if (str.slice(i, i + 4) === '<!--') {
    var commentEnd = str.indexOf('-->', i + 4);
    return commentEnd !== -1 ? { alwaysStrip: true, len: commentEnd + 3 - i } : null;
  }

  // Processing instruction: <? ... ?>
  if (str[i + 1] === '?') {
    var piEnd = str.indexOf('?>', i + 2);
    return piEnd !== -1 ? { alwaysStrip: true, len: piEnd + 2 - i } : null;
  }

  // Doctype / other <! declarations: <! ... >
  if (str[i + 1] === '!') {
    var declEnd = str.indexOf('>', i + 2);
    return declEnd !== -1 ? { alwaysStrip: true, len: declEnd + 1 - i } : null;
  }

  // Closing tag or opening tag: </tagName ...> or <tagName ...>
  var isClosing = str[i + 1] === '/';
  var j = isClosing ? i + 2 : i + 1;
  var nameStart = j;
  while (j < str.length) {
    var code = str.charCodeAt(j);
    var isLetter = (code >= 97 && code <= 122) || (code >= 65 && code <= 90);
    var isDigit = code >= 48 && code <= 57;
    // First character must be a letter; later ones may also be digits.
    if (j === nameStart ? !isLetter : !(isLetter || isDigit)) break;
    j++;
  }
  var tagName = str.slice(nameStart, j).toLowerCase();
  if (!tagName) return null;

  // The name has to end at a tag-name delimiter. Without this check the name
  // would be silently truncated, so <a@example.com> would be reported as an
  // <a> tag and stripped by callers that only look at tagName.
  // charAt() yields '' at end of input, which is rejected too (unclosed tag).
  var delimiter = str.charAt(j);
  if (delimiter !== '>' && delimiter !== '/' && !/^[ \t\n\f\r]$/.test(delimiter)) {
    return null;
  }

  // Scan for >, respecting quoted attribute values
  var quote = null;
  while (j < str.length) {
    var ch = str[j];
    if (quote) {
      if (ch === quote) quote = null;
    } else if (ch === '"' || ch === '\'') {
      quote = ch;
    } else if (ch === '>') {
      return { tagName: tagName, len: j + 1 - i };
    }
    j++;
  }
  return null; // unclosed tag
}

/*
 * Strip HTML tags, leaving bare text content.
 * Scans the string for markup - HTML tags, comments, doctypes, and processing
 * instructions - and removes them. Only tags with known HTML element names
 * are stripped; unknown angle-bracket content like <<<NotHTML>>> is preserved.
 *
 * Every '<' is handed to readMarkupAt; when that reports either always-strip
 * markup or a tag whose name is in HTML_TAGS, the whole construct is skipped.
 * Otherwise the '<' is kept as literal text and scanning resumes right after
 * it.
 *
 * @param {string} str
 * @returns {string} str with recognized markup removed
 * @private
 */
function stripHtml (str) {
  var out = '';
  var i = 0;
  while (i < str.length) {
    // Copy the run of plain text up to the next '<' in one go.
    var lt = str.indexOf('<', i);
    if (lt === -1) {
      out += str.slice(i);
      break;
    }
    out += str.slice(i, lt);

    var markup = readMarkupAt(str, lt);
    if (markup && (markup.alwaysStrip || HTML_TAGS.has(markup.tagName))) {
      // Recognized markup: drop it entirely.
      i = lt + markup.len;
    } else {
      // Not markup we know about: keep the '<' and carry on after it.
      out += '<';
      i = lt + 1;
    }
  }
  return out;
}

// HTML_TAGS is exported alongside stripHtml so callers (the test suite
// iterates over it) can enumerate every tag name stripHtml will remove.
exports.HTML_TAGS = HTML_TAGS;
exports.stripHtml = stripHtml;
55 changes: 55 additions & 0 deletions test/angle-brackets.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
const { Readable } = require('stream');
// Checks that an item title containing XML-escaped angle brackets is
// delivered with literal '<' and '>' characters intact.
// NOTE(review): FeedParser and assert are not required in this file -
// presumably they are provided as globals by the test setup; confirm.
describe('angle brackets in title', function () {

// Two feeds that differ only in how the item title escapes its brackets:
// numeric character references (&#x3C; / &#x3E;) in the first, named
// entities (&lt; / &gt;) in the second.
var feeds = [
`<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>W3Schools Home Page</title>
<link>http://www.w3schools.com</link>
<description>Free web building tutorials</description>
<item>
<title>RSS &#x3C;&#x3C;&#x3C;Tutorial&#x3E;&#x3E;&#x3E;</title>
<link>http://www.w3schools.com/xml/xml_rss.asp</link>
<description>New RSS tutorial on W3Schools</description>
</item>
</channel>
</rss>
`, `<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>W3Schools Home Page</title>
<link>http://www.w3schools.com</link>
<description>Free web building tutorials</description>
<item>
<title>RSS &lt;&lt;&lt;Tutorial&gt;&gt;&gt;</title>
<link>http://www.w3schools.com/xml/xml_rss.asp</link>
<description>New RSS tutorial on W3Schools</description>
</item>
</channel>
</rss>
`];

// One test case per feed; both cases share the same description string.
feeds.forEach(function (feed) {
it('should be properly decoded', function (done) {
var feedparser = new FeedParser();
// Titles of the items read so far, in arrival order.
var titles = [];
Readable.from(feed).pipe(feedparser);
feedparser.on('readable', function () {
var item;
// Drain every item currently buffered; read() returns a falsy value when
// nothing more is available.
while ((item = this.read())) {
titles.push(item.title);
}
})
.on('error', function (err) {
// NOTE(review): assert.ifError throws for any err other than
// null/undefined, so done(err) is only reached when err is nullish.
assert.ifError(err);
done(err);
})
.on('end', function () {
// Each feed has a single item, so only the first title is checked.
assert.equal(titles[0], 'RSS <<<Tutorial>>>');
done();
});
});
});

});
126 changes: 122 additions & 4 deletions test/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,14 @@ describe('utils', function () {

describe('stripHtml', function () {

it('returns the string unchanged when there are no tags', function () {
assert.strictEqual(utils.stripHtml('plain text'), 'plain text');
});

it('returns an empty string for an empty string', function () {
assert.strictEqual(utils.stripHtml(''), '');
});

it('removes simple HTML tags', function () {
assert.strictEqual(utils.stripHtml('<b>bold</b>'), 'bold');
});
Expand All @@ -323,12 +331,122 @@ describe('utils', function () {
assert.strictEqual(utils.stripHtml('<a href="http://example.com">link</a>'), 'link');
});

it('returns the string unchanged when there are no tags', function () {
assert.strictEqual(utils.stripHtml('plain text'), 'plain text');
it('strips nested tags leaving inner text', function () {
assert.strictEqual(utils.stripHtml('<div>You <strong>MUST</strong> remove tags</div>'), 'You MUST remove tags');
});

it('returns an empty string for an empty string', function () {
assert.strictEqual(utils.stripHtml(''), '');
it('strips a self-closing tag without trailing slash', function () {
assert.strictEqual(utils.stripHtml('before<hr>after'), 'beforeafter');
});

it('strips a self-closing tag with trailing slash', function () {
assert.strictEqual(utils.stripHtml('before<br/>after'), 'beforeafter');
});

it('strips a self-closing tag with attributes', function () {
assert.strictEqual(utils.stripHtml('before<img src="x.png" alt="x">after'), 'beforeafter');
});

it('strips img with self-closing slash and attributes', function () {
assert.strictEqual(utils.stripHtml('before<img src="x.png" />after'), 'beforeafter');
});

it('strips closing tags', function () {
assert.strictEqual(utils.stripHtml('text</p>'), 'text');
});

it('strips tags with multiple attributes', function () {
assert.strictEqual(utils.stripHtml('<div class="foo" id="bar">content</div>'), 'content');
});

// --- edge cases: non-HTML angle brackets must be preserved ---

it('preserves literal angle brackets that are not HTML tags', function () {
assert.strictEqual(utils.stripHtml('1 < 2'), '1 < 2');
});

it('preserves less-than followed by a number', function () {
assert.strictEqual(utils.stripHtml('x<3'), 'x<3');
});

it('preserves triple left angle brackets', function () {
assert.strictEqual(utils.stripHtml('<<<'), '<<<');
});

it('preserves triple right angle brackets', function () {
assert.strictEqual(utils.stripHtml('>>>'), '>>>');
});

it('preserves encoded angle brackets decoded by XML parser', function () {
assert.strictEqual(utils.stripHtml('RSS <<<Tutorial>>>'), 'RSS <<<Tutorial>>>');
});

it('does not treat unknown tag names as HTML', function () {
assert.strictEqual(utils.stripHtml('a <foo> b'), 'a <foo> b');
});

it('does not treat a closing tag with unknown name as HTML', function () {
assert.strictEqual(utils.stripHtml('</foo>'), '</foo>');
});

it('preserves bare angle brackets adjacent to real HTML tags', function () {
assert.strictEqual(utils.stripHtml('<<b>bold</b>>'), '<bold>');
});

it('strips real HTML but preserves non-HTML angle brackets in the same string', function () {
assert.strictEqual(utils.stripHtml('if a < b then <em>yes</em> else x<y>z'), 'if a < b then yes else x<y>z');
});

it('strips non-void elements written as self-closing like <div />', function () {
assert.strictEqual(utils.stripHtml('<div />text</div>'), 'text');
});

it('strips multiline tags spanning several lines', function () {
assert.strictEqual(
utils.stripHtml('<img \n src="logo.png" \n alt="Company Logo" \n width="200" \n height="100"\n>'),
''
);
});

it('strips HTML comments', function () {
assert.strictEqual(utils.stripHtml('before<!-- comment -->after'), 'beforeafter');
});

it('strips multi-line HTML comments', function () {
assert.strictEqual(utils.stripHtml('before<!-- a\ncomment -->after'), 'beforeafter');
});

it('strips doctype declarations', function () {
assert.strictEqual(utils.stripHtml('<!DOCTYPE html>Title'), 'Title');
});

it('strips XML processing instructions', function () {
assert.strictEqual(utils.stripHtml('<?xml version="1.0"?>text'), 'text');
});

it('strips xml-stylesheet processing instructions', function () {
assert.strictEqual(utils.stripHtml('<?xml-stylesheet href="style.xsl" type="text/xsl"?>text'), 'text');
});

it('strips mixed HTML tags, comments, and processing instructions in one string', function () {
assert.strictEqual(
utils.stripHtml('<?xml version="1.0"?><p><!-- intro -->Hello <em>world</em></p>'),
'Hello world'
);
});

it('strips tags with double-quoted attribute values containing >', function () {
assert.strictEqual(utils.stripHtml('<a title="1 > 0">link</a>'), 'link');
});

it('strips tags with single-quoted attribute values containing >', function () {
assert.strictEqual(utils.stripHtml('<a title=\'1 > 0\'>link</a>'), 'link');
});

utils.HTML_TAGS.forEach(function (tag) {
it(`strips ${tag} HTML tag opening and closing and self-closing`, function () {
assert.strictEqual(utils.stripHtml('<' + tag + '>content</' + tag + '> and <' + tag + ' />more'), 'content and more', 'expected <' + tag + '> to be stripped');
});
});

});
Expand Down