refactor

wxiaoguang · wxiaoguang · commit b5065b70318f · 2022-02-26T23:13:12.000+08:00
diff --git a/modules/charset/escape.go b/modules/charset/escape.go
@@ -63,6 +63,7 @@ func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
 func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
 	buf := make([]byte, 4096)
 	readStart := 0
+	runeCount := 0
 	var n int
 	var writePos int
 
@@ -79,6 +80,8 @@ readingloop:
 
 		for i < len(bs) {
 			r, size := utf8.DecodeRune(bs[i:])
+			runeCount++
+
 			// Now handle the codepoints
 			switch {
 			case r == utf8.RuneError:
@@ -113,6 +116,8 @@ readingloop:
 				lineHasRTLScript = false
 				lineHasLTRScript = false
 
+			case runeCount == 1 && r == 0xFEFF: // UTF BOM
+				// a BOM as the very first rune of the input is legitimate, so it is left unescaped
 			case r == '\r' || r == '\t' || r == ' ':
 				// These are acceptable control characters and space characters
 			case unicode.IsSpace(r):
@@ -144,8 +149,7 @@ readingloop:
 					return
 				}
 				writePos = i + size
-			// 65279 == BOM rune.
-			case r != rune(65279) && unicode.Is(unicode.C, r):
+			case unicode.Is(unicode.C, r):
 				escaped.Escaped = true
 				escaped.HasControls = true
 				if writePos < i {
diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go
@@ -130,18 +130,12 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
 		status: EscapeStatus{Escaped: true, HasBIDI: true, BadBIDI: true, HasLTRScript: true, HasRTLScript: true},
 	},
 	{
-		name:   "BOM encoding UTF-8",
-		text:   string([]byte{'\xef', '\xbb', '\xbf'}),
-		result: string([]byte{'\xef', '\xbb', '\xbf'}),
-		status: EscapeStatus{},
-	},
-	{
-		name:   "BOM encoding UTF-16",
-		text:   string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}),
-		result: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}),
-		status: EscapeStatus{
-			HasLTRScript: true,
-		},
+		// UTF-8/16/32 all use the same code point (U+FEFF) for the BOM.
+		// Gitea can read UTF-16/32 content and convert it to UTF-8 internally before rendering, so only UTF-8 is processed (and tested) here.
+		name:   "UTF BOM",
+		text:   "\xef\xbb\xbftest",
+		result: "\xef\xbb\xbftest",
+		status: EscapeStatus{HasLTRScript: true},
 	},
 }
 
@@ -177,19 +171,27 @@ func TestEscapeControlReader(t *testing.T) {
 	// lets add some control characters to the tests
 	tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
 	copy(tests, escapeControlTests)
+
+	// if the text starts with a UTF-8 BOM, insert the prefix after it so that the BOM stays first
+	addPrefix := func(prefix string, s string) string {
+		if strings.HasPrefix(s, "\xef\xbb\xbf") {
+			return s[:3] + prefix + s[3:]
+		}
+		return prefix + s
+	}
 	for _, test := range escapeControlTests {
 		test.name += " (+Control)"
-		test.text = "\u001E" + test.text
-		test.result = `<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">` + "\u001e" + `</span></span>` + test.result
+		test.text = addPrefix("\u001E", test.text)
+		test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">`+"\u001e"+`</span></span>`, test.result)
 		test.status.Escaped = true
 		test.status.HasControls = true
 		tests = append(tests, test)
 	}
 
 	for _, test := range escapeControlTests {
 		test.name += " (+Mark)"
-		test.text = "\u0300" + test.text
-		test.result = `<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">` + "\u0300" + `</span></span>` + test.result
+		test.text = addPrefix("\u0300", test.text)
+		test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">`+"\u0300"+`</span></span>`, test.result)
 		test.status.Escaped = true
 		test.status.HasMarks = true
 		tests = append(tests, test)