Add basic content sanitizer (#1344)

JoannaaKL · web-flow · commit 304f29a2fd33 · 2025-10-31T12:51:22.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,6 @@ bin/
 .DS_Store
 
 # binary
-github-mcp-server
+github-mcp-server
+
+.history
diff --git a/pkg/github/issues.go b/pkg/github/issues.go
@@ -10,6 +10,7 @@ import (
 	"time"
 
 	ghErrors "github.com/github/github-mcp-server/pkg/errors"
+	"github.com/github/github-mcp-server/pkg/sanitize"
 	"github.com/github/github-mcp-server/pkg/translations"
 	"github.com/go-viper/mapstructure/v2"
 	"github.com/google/go-github/v76/github"
@@ -211,15 +212,15 @@ func fragmentToIssue(fragment IssueFragment) *github.Issue {
 
 	return &github.Issue{
 		Number:    github.Ptr(int(fragment.Number)),
-		Title:     github.Ptr(string(fragment.Title)),
+		Title:     github.Ptr(sanitize.FilterInvisibleCharacters(string(fragment.Title))),
 		CreatedAt: &github.Timestamp{Time: fragment.CreatedAt.Time},
 		UpdatedAt: &github.Timestamp{Time: fragment.UpdatedAt.Time},
 		User: &github.User{
 			Login: github.Ptr(string(fragment.Author.Login)),
 		},
 		State:    github.Ptr(string(fragment.State)),
 		ID:       github.Ptr(fragment.DatabaseID),
-		Body:     github.Ptr(string(fragment.Body)),
+		Body:     github.Ptr(sanitize.FilterInvisibleCharacters(string(fragment.Body))),
 		Labels:   foundLabels,
 		Comments: github.Ptr(int(fragment.Comments.TotalCount)),
 	}
@@ -323,6 +324,16 @@ func GetIssue(ctx context.Context, client *github.Client, owner string, repo str
 		return mcp.NewToolResultError(fmt.Sprintf("failed to get issue: %s", string(body))), nil
 	}
 
+	// Sanitize title/body on response
+	if issue != nil {
+		if issue.Title != nil {
+			issue.Title = github.Ptr(sanitize.FilterInvisibleCharacters(*issue.Title))
+		}
+		if issue.Body != nil {
+			issue.Body = github.Ptr(sanitize.FilterInvisibleCharacters(*issue.Body))
+		}
+	}
+
 	r, err := json.Marshal(issue)
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal issue: %w", err)
diff --git a/pkg/github/pullrequests.go b/pkg/github/pullrequests.go
@@ -14,6 +14,7 @@ import (
 	"github.com/shurcooL/githubv4"
 
 	ghErrors "github.com/github/github-mcp-server/pkg/errors"
+	"github.com/github/github-mcp-server/pkg/sanitize"
 	"github.com/github/github-mcp-server/pkg/translations"
 )
 
@@ -123,6 +124,16 @@ func GetPullRequest(ctx context.Context, client *github.Client, owner, repo stri
 		return mcp.NewToolResultError(fmt.Sprintf("failed to get pull request: %s", string(body))), nil
 	}
 
+	// sanitize title/body on response
+	if pr != nil {
+		if pr.Title != nil {
+			pr.Title = github.Ptr(sanitize.FilterInvisibleCharacters(*pr.Title))
+		}
+		if pr.Body != nil {
+			pr.Body = github.Ptr(sanitize.FilterInvisibleCharacters(*pr.Body))
+		}
+	}
+
 	r, err := json.Marshal(pr)
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal response: %w", err)
@@ -804,6 +815,19 @@ func ListPullRequests(getClient GetClientFn, t translations.TranslationHelperFun
 				return mcp.NewToolResultError(fmt.Sprintf("failed to list pull requests: %s", string(body))), nil
 			}
 
+			// sanitize title/body on each PR
+			for _, pr := range prs {
+				if pr == nil {
+					continue
+				}
+				if pr.Title != nil {
+					pr.Title = github.Ptr(sanitize.FilterInvisibleCharacters(*pr.Title))
+				}
+				if pr.Body != nil {
+					pr.Body = github.Ptr(sanitize.FilterInvisibleCharacters(*pr.Body))
+				}
+			}
+
 			r, err := json.Marshal(prs)
 			if err != nil {
 				return nil, fmt.Errorf("failed to marshal response: %w", err)
diff --git a/pkg/sanitize/sanitize.go b/pkg/sanitize/sanitize.go
@@ -0,0 +1,56 @@
+package sanitize
+
+// FilterInvisibleCharacters removes invisible or control characters that should not appear
+// in user-facing titles or bodies. This includes:
+// - Unicode tag characters: U+E0001, U+E0020–U+E007F
+// - BiDi control characters: U+202A–U+202E, U+2066–U+2069
+// - Hidden modifier characters: U+200B, U+200C, U+200E, U+200F, U+00AD, U+FEFF, U+180E, U+2060–U+2064
+func FilterInvisibleCharacters(input string) string {
+	if input == "" {
+		return input
+	}
+
+	// Filter runes
+	out := make([]rune, 0, len(input))
+	for _, r := range input {
+		if !shouldRemoveRune(r) {
+			out = append(out, r)
+		}
+	}
+	return string(out)
+}
+
+func shouldRemoveRune(r rune) bool {
+	switch r {
+	case 0x200B, // ZERO WIDTH SPACE
+		0x200C, // ZERO WIDTH NON-JOINER
+		0x200E, // LEFT-TO-RIGHT MARK
+		0x200F, // RIGHT-TO-LEFT MARK
+		0x00AD, // SOFT HYPHEN
+		0xFEFF, // ZERO WIDTH NO-BREAK SPACE
+		0x180E: // MONGOLIAN VOWEL SEPARATOR
+		return true
+	case 0xE0001: // TAG
+		return true
+	}
+
+	// Ranges
+	// Unicode tags: U+E0020–U+E007F
+	if r >= 0xE0020 && r <= 0xE007F {
+		return true
+	}
+	// BiDi controls: U+202A–U+202E
+	if r >= 0x202A && r <= 0x202E {
+		return true
+	}
+	// BiDi isolates: U+2066–U+2069
+	if r >= 0x2066 && r <= 0x2069 {
+		return true
+	}
+	// Hidden modifiers: U+2060–U+2064
+	if r >= 0x2060 && r <= 0x2064 {
+		return true
+	}
+
+	return false
+}
diff --git a/pkg/sanitize/sanitize_test.go b/pkg/sanitize/sanitize_test.go
@@ -0,0 +1,188 @@
+package sanitize
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestFilterInvisibleCharacters(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "empty string",
+			input:    "",
+			expected: "",
+		},
+		{
+			name:     "normal text without invisible characters",
+			input:    "Hello World",
+			expected: "Hello World",
+		},
+		{
+			name:     "text with zero width space",
+			input:    "Hello\u200BWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with zero width non-joiner",
+			input:    "Hello\u200CWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with left-to-right mark",
+			input:    "Hello\u200EWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with right-to-left mark",
+			input:    "Hello\u200FWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with soft hyphen",
+			input:    "Hello\u00ADWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with zero width no-break space (BOM)",
+			input:    "Hello\uFEFFWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with mongolian vowel separator",
+			input:    "Hello\u180EWorld",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with unicode tag character",
+			input:    "Hello\U000E0001World",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with unicode tag range characters",
+			input:    "Hello\U000E0020World\U000E007FTest",
+			expected: "HelloWorldTest",
+		},
+		{
+			name:     "text with bidi control characters",
+			input:    "Hello\u202AWorld\u202BTest\u202CEnd\u202DMore\u202EFinal",
+			expected: "HelloWorldTestEndMoreFinal",
+		},
+		{
+			name:     "text with bidi isolate characters",
+			input:    "Hello\u2066World\u2067Test\u2068End\u2069Final",
+			expected: "HelloWorldTestEndFinal",
+		},
+		{
+			name:     "text with hidden modifier characters",
+			input:    "Hello\u2060World\u2061Test\u2062End\u2063More\u2064Final",
+			expected: "HelloWorldTestEndMoreFinal",
+		},
+		{
+			name:     "multiple invisible characters mixed",
+			input:    "Hello\u200B\u200C\u200E\u200F\u00AD\uFEFF\u180E\U000E0001World",
+			expected: "HelloWorld",
+		},
+		{
+			name:     "text with normal unicode characters (should be preserved)",
+			input:    "Hello 世界 🌍 αβγ",
+			expected: "Hello 世界 🌍 αβγ",
+		},
+		{
+			name:     "invisible characters at start and end",
+			input:    "\u200BHello World\u200C",
+			expected: "Hello World",
+		},
+		{
+			name:     "only invisible characters",
+			input:    "\u200B\u200C\u200E\u200F",
+			expected: "",
+		},
+		{
+			name:     "real-world example with title",
+			input:    "Fix\u200B bug\u00AD in\u202A authentication\u202C",
+			expected: "Fix bug in authentication",
+		},
+		{
+			name:     "issue body with mixed content",
+			input:    "This is a\u200B bug report.\n\nSteps to reproduce:\u200C\n1. Do this\u200E\n2. Do that\u200F",
+			expected: "This is a bug report.\n\nSteps to reproduce:\n1. Do this\n2. Do that",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := FilterInvisibleCharacters(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestShouldRemoveRune(t *testing.T) {
+	tests := []struct {
+		name     string
+		rune     rune
+		expected bool
+	}{
+		// Individual characters that should be removed
+		{name: "zero width space", rune: 0x200B, expected: true},
+		{name: "zero width non-joiner", rune: 0x200C, expected: true},
+		{name: "left-to-right mark", rune: 0x200E, expected: true},
+		{name: "right-to-left mark", rune: 0x200F, expected: true},
+		{name: "soft hyphen", rune: 0x00AD, expected: true},
+		{name: "zero width no-break space", rune: 0xFEFF, expected: true},
+		{name: "mongolian vowel separator", rune: 0x180E, expected: true},
+		{name: "unicode tag", rune: 0xE0001, expected: true},
+
+		// Range tests - Unicode tags: U+E0020–U+E007F
+		{name: "unicode tag range start", rune: 0xE0020, expected: true},
+		{name: "unicode tag range middle", rune: 0xE0050, expected: true},
+		{name: "unicode tag range end", rune: 0xE007F, expected: true},
+		{name: "before unicode tag range", rune: 0xE001F, expected: false},
+		{name: "after unicode tag range", rune: 0xE0080, expected: false},
+
+		// Range tests - BiDi controls: U+202A–U+202E
+		{name: "bidi control range start", rune: 0x202A, expected: true},
+		{name: "bidi control range middle", rune: 0x202C, expected: true},
+		{name: "bidi control range end", rune: 0x202E, expected: true},
+		{name: "before bidi control range", rune: 0x2029, expected: false},
+		{name: "after bidi control range", rune: 0x202F, expected: false},
+
+		// Range tests - BiDi isolates: U+2066–U+2069
+		{name: "bidi isolate range start", rune: 0x2066, expected: true},
+		{name: "bidi isolate range middle", rune: 0x2067, expected: true},
+		{name: "bidi isolate range end", rune: 0x2069, expected: true},
+		{name: "before bidi isolate range", rune: 0x2065, expected: false},
+		{name: "after bidi isolate range", rune: 0x206A, expected: false},
+
+		// Range tests - Hidden modifiers: U+2060–U+2064
+		{name: "hidden modifier range start", rune: 0x2060, expected: true},
+		{name: "hidden modifier range middle", rune: 0x2062, expected: true},
+		{name: "hidden modifier range end", rune: 0x2064, expected: true},
+		{name: "before hidden modifier range", rune: 0x205F, expected: false},
+		{name: "after hidden modifier range", rune: 0x2065, expected: false},
+
+		// Characters that should NOT be removed
+		{name: "regular ascii letter", rune: 'A', expected: false},
+		{name: "regular ascii digit", rune: '1', expected: false},
+		{name: "regular ascii space", rune: ' ', expected: false},
+		{name: "newline", rune: '\n', expected: false},
+		{name: "tab", rune: '\t', expected: false},
+		{name: "unicode letter", rune: '世', expected: false},
+		{name: "emoji", rune: '🌍', expected: false},
+		{name: "greek letter", rune: 'α', expected: false},
+		{name: "punctuation", rune: '.', expected: false},
+		{name: "hyphen (normal)", rune: '-', expected: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := shouldRemoveRune(tt.rune)
+			assert.Equal(t, tt.expected, result, "rune: U+%04X (%c)", tt.rune, tt.rune)
+		})
+	}
+}