Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ require (
github.com/spf13/viper v1.21.0
github.com/stretchr/testify v1.11.1
github.com/yosida95/uritemplate/v3 v3.0.2
github.com/yuin/goldmark v1.8.2
)

require (
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSW
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.8.2 h1:kEGpgqJXdgbkhcOgBxkC0X0PmoPG1ZyoZ117rDVp4zE=
github.com/yuin/goldmark v1.8.2/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand Down
113 changes: 111 additions & 2 deletions pkg/sanitize/sanitize.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,52 @@
// Package sanitize provides functions to sanitize untrusted user content from
// GitHub (issue titles, PR bodies, search results, etc.) before display.
//
// Threat model:
// - Input source: untrusted GitHub content that may contain malicious HTML/JS
// - Goal: strip HTML tags from prose without corrupting code samples
// - Defense: bluemonday HTML sanitizer + angle bracket protection in code blocks
//
// The sanitizer preserves angle brackets inside code (fenced blocks, inline spans,
// and indented blocks) because bluemonday treats <int>, <T>, etc. as unknown HTML
// tags and removes them. This would corrupt generic type syntax and other code.
package sanitize

import (
"bytes"
"strings"
"sync"
"unicode"

"github.com/microcosm-cc/bluemonday"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/text"
)

var policy *bluemonday.Policy
var policyOnce sync.Once

// Sanitize removes HTML tags and invisible characters from untrusted input.
//
// Ordering invariant: FilterInvisibleCharacters must run before
// protectCodeAngleBrackets to prevent sentinel collision attacks.
// FilterInvisibleCharacters strips NUL bytes (0x00), so an attacker cannot
// inject literal "\x00LT\x00script\x00GT\x00" strings that would bypass
// FilterHTMLTags and get restored to <script> by restoreCodeAngleBrackets.
func Sanitize(input string) string {
return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input)))
cleaned := FilterCodeFenceMetadata(FilterInvisibleCharacters(input))
// Protect angle brackets inside code blocks and inline code spans
// from being stripped by the HTML sanitizer. bluemonday treats <int>,
// <T>, etc. as unknown HTML tags and removes them.
// See https://github.com/github/github-mcp-server/issues/2202
protected := protectCodeAngleBrackets(cleaned)
sanitized := FilterHTMLTags(protected)
return restoreCodeAngleBrackets(sanitized)
Comment on lines 36 to +44
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional: pull out the ordering as a named invariant so future readers don't accidentally reorder these calls. The security argument relies on FilterInvisibleCharacters running first (so NUL is stripped before sentinels are introduced) and on restoreCodeAngleBrackets running last (after FilterHTMLTags).

Suggested change
func Sanitize(input string) string {
return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input)))
cleaned := FilterCodeFenceMetadata(FilterInvisibleCharacters(input))
// Protect angle brackets inside code blocks and inline code spans
// from being stripped by the HTML sanitizer. bluemonday treats <int>,
// <T>, etc. as unknown HTML tags and removes them.
// See https://github.com/github/github-mcp-server/issues/2202
protected := protectCodeAngleBrackets(cleaned)
sanitized := FilterHTMLTags(protected)
return restoreCodeAngleBrackets(sanitized)
func Sanitize(input string) string {
// Ordering is security-load-bearing:
// 1. FilterInvisibleCharacters strips NUL bytes BEFORE sentinels are
// introduced, preventing an attacker from forging \x00LT\x00 in input.
// 2. protectCodeAngleBrackets replaces < and > inside code with sentinels
// so bluemonday does not strip them as HTML.
// 3. FilterHTMLTags (bluemonday) sanitizes prose; sentinels pass through.
// 4. restoreCodeAngleBrackets converts sentinels back to < and >.
// See https://github.com/github/github-mcp-server/issues/2202
cleaned := FilterCodeFenceMetadata(FilterInvisibleCharacters(input))
protected := protectCodeAngleBrackets(cleaned)
sanitized := FilterHTMLTags(protected)
return restoreCodeAngleBrackets(sanitized)
}

}

// FilterInvisibleCharacters removes invisible or control characters that should not appear
// in user-facing titles or bodies. This includes:
// - NUL bytes: U+0000 (prevents sentinel collision in protectCodeAngleBrackets)
// - Unicode tag characters: U+E0001, U+E0020–U+E007F
// - BiDi control characters: U+202A–U+202E, U+2066–U+2069
// - Hidden modifier characters: U+200B, U+200C, U+200E, U+200F, U+00AD, U+FEFF, U+180E, U+2060–U+2064
Expand Down Expand Up @@ -145,6 +175,84 @@ func isSafeCodeFenceToken(token string) bool {
return true
}

// Sentinels used to protect angle brackets inside code from HTML sanitization.
// NUL bytes are stripped by FilterInvisibleCharacters before protectCodeAngleBrackets
// runs, preventing sentinel collision attacks where an attacker injects literal
// "\x00LT\x00" strings to bypass FilterHTMLTags.
const (
ltSentinel = "\x00LT\x00"
gtSentinel = "\x00GT\x00"
)
Comment on lines +178 to +185
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth noting in the doc comment that the NUL-stripping in FilterInvisibleCharacters is what makes these sentinels collision-free — it's a non-obvious dependency.

Suggested change
// Sentinels used to protect angle brackets inside code from HTML sanitization.
// These are chosen to be unlikely to appear in real content.
const (
ltSentinel = "\x00LT\x00"
gtSentinel = "\x00GT\x00"
)
// Sentinels used to protect angle brackets inside code from HTML sanitization.
// They embed NUL bytes, which are stripped from all input by
// FilterInvisibleCharacters before this function runs — that strip is what
// makes the sentinels collision-free against attacker-controlled input.
const (
ltSentinel = "\x00LT\x00"
gtSentinel = "\x00GT\x00"
)


// protectCodeAngleBrackets replaces < and > inside code blocks and inline code
// spans with sentinels so bluemonday does not strip them as HTML.
//
// Uses goldmark's CommonMark parser to identify code regions. This correctly
// handles fenced blocks (```), inline spans (`), and indented (4-space) blocks.
func protectCodeAngleBrackets(input string) string {
src := []byte(input)
doc := goldmark.DefaultParser().Parse(text.NewReader(src))

type span struct{ start, stop int }
var spans []span

_ = ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if !entering {
return ast.WalkContinue, nil
}
switch n.Kind() {
case ast.KindFencedCodeBlock, ast.KindCodeBlock:
// Fenced (```) and indented (4-space) code blocks
lines := n.Lines()
for i := range lines.Len() {
seg := lines.At(i)
spans = append(spans, span{seg.Start, seg.Stop})
}
case ast.KindCodeSpan:
// Inline code: `...`
for c := n.FirstChild(); c != nil; c = c.NextSibling() {
if t, ok := c.(*ast.Text); ok {
seg := t.Segment
spans = append(spans, span{seg.Start, seg.Stop})
}
}
}
return ast.WalkContinue, nil
})

if len(spans) == 0 {
return input
}

var out bytes.Buffer
out.Grow(len(src))
cursor := 0
for _, s := range spans {
// Write text before this code span
out.Write(src[cursor:s.start])
// Protect angle brackets in the code span
for _, b := range src[s.start:s.stop] {
switch b {
case '<':
out.WriteString(ltSentinel)
case '>':
out.WriteString(gtSentinel)
default:
out.WriteByte(b)
}
}
cursor = s.stop
}
out.Write(src[cursor:])
return out.String()
}

// restoreCodeAngleBrackets converts sentinels back to angle brackets.
func restoreCodeAngleBrackets(input string) string {
s := strings.ReplaceAll(input, ltSentinel, "<")
return strings.ReplaceAll(s, gtSentinel, ">")
}

func getPolicy() *bluemonday.Policy {
policyOnce.Do(func() {
p := bluemonday.StrictPolicy()
Expand Down Expand Up @@ -175,7 +283,8 @@ func getPolicy() *bluemonday.Policy {

func shouldRemoveRune(r rune) bool {
switch r {
case 0x200B, // ZERO WIDTH SPACE
case 0x0000, // NUL — stripped to prevent sentinel collision in protectCodeAngleBrackets
0x200B, // ZERO WIDTH SPACE
0x200C, // ZERO WIDTH NON-JOINER
0x200E, // LEFT-TO-RIGHT MARK
0x200F, // RIGHT-TO-LEFT MARK
Expand Down
62 changes: 62 additions & 0 deletions pkg/sanitize/sanitize_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ func TestShouldRemoveRune(t *testing.T) {
expected bool
}{
// Individual characters that should be removed
{name: "NUL byte", rune: 0x0000, expected: true},
{name: "zero width space", rune: 0x200B, expected: true},
{name: "zero width non-joiner", rune: 0x200C, expected: true},
{name: "left-to-right mark", rune: 0x200E, expected: true},
Expand Down Expand Up @@ -300,3 +301,64 @@ func TestSanitizeRemovesInvisibleCodeFenceMetadata(t *testing.T) {
result := Sanitize(input)
assert.Equal(t, expected, result)
}

func TestSanitizePreservesAngleBracketsInCodeBlocks(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "fenced code block with angle brackets",
input: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
expected: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
},
{
name: "inline code with angle brackets",
input: "Use `Vec<String>` for collections.",
expected: "Use `Vec<String>` for collections.",
},
{
name: "angle brackets outside code are sanitized",
input: "This has <script>alert('xss')</script> in it.",
expected: "This has in it.",
},
{
name: "fenced code block with generic types",
input: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.",
expected: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.",
},
{
name: "multiple inline code spans with angle brackets",
input: "Compare `Map<K, V>` and `Set<T>`.",
expected: "Compare `Map<K, V>` and `Set<T>`.",
},
{
name: "no code blocks passes through",
input: "No code here, just text.",
expected: "No code here, just text.",
},
{
name: "sentinel collision does not bypass sanitizer",
input: "\x00LT\x00script\x00GT\x00alert(1)\x00LT\x00/script\x00GT\x00",
expected: "LTscriptGTalert(1)LT/scriptGT", // NUL bytes stripped; sentinels don't match; no <script> injected
},
{
name: "NUL bytes stripped from input with code blocks",
input: "```\nfunc Foo\x00[T any]()\n```",
expected: "```\nfunc Foo[T any]()\n```",
},
{
name: "indented code block with angle brackets",
input: "Text\n\n let x: Vec<u8> = vec![];\n\nMore text",
expected: "Text\n\n let x: Vec<u8> = vec![];\n\nMore text",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := Sanitize(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}