Skip to content

refactor #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions modules/charset/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
buf := make([]byte, 4096)
readStart := 0
runeCount := 0
var n int
var writePos int

Expand All @@ -79,6 +80,8 @@ readingloop:

for i < len(bs) {
r, size := utf8.DecodeRune(bs[i:])
runeCount++

// Now handle the codepoints
switch {
case r == utf8.RuneError:
Expand Down Expand Up @@ -113,6 +116,8 @@ readingloop:
lineHasRTLScript = false
lineHasLTRScript = false

case runeCount == 1 && r == 0xFEFF: // UTF BOM
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use the i variable instead for this?

Suggested change
case runeCount == 1 && r == 0xFEFF: // UTF BOM
case i == 0 && r == 0xFEFF: // UTF BOM

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, i is inside another loop.

Only runeCount is the correct counter for the whole content.

// the first BOM is safe
case r == '\r' || r == '\t' || r == ' ':
// These are acceptable control characters and space characters
case unicode.IsSpace(r):
Expand Down Expand Up @@ -144,8 +149,7 @@ readingloop:
return
}
writePos = i + size
// 65279 == BOM rune.
case r != rune(65279) && unicode.Is(unicode.C, r):
case unicode.Is(unicode.C, r):
escaped.Escaped = true
escaped.HasControls = true
if writePos < i {
Expand Down
34 changes: 18 additions & 16 deletions modules/charset/escape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,12 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
status: EscapeStatus{Escaped: true, HasBIDI: true, BadBIDI: true, HasLTRScript: true, HasRTLScript: true},
},
{
name: "BOM encoding UTF-8",
text: string([]byte{'\xef', '\xbb', '\xbf'}),
result: string([]byte{'\xef', '\xbb', '\xbf'}),
status: EscapeStatus{},
},
{
name: "BOM encoding UTF-16",
text: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}),
result: string([]byte{239, 187, 191, 228, 189, 160, 229, 165, 189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 10, 104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 10}),
status: EscapeStatus{
HasLTRScript: true,
},
// UTF-8/16/32 all use the same codepoint for BOM
// Gitea could read UTF-16/32 content and convert into UTF-8 internally then render it, so we only process UTF-8 internally
name: "UTF BOM",
text: "\xef\xbb\xbftest",
result: "\xef\xbb\xbftest",
status: EscapeStatus{HasLTRScript: true},
},
}

Expand Down Expand Up @@ -177,19 +171,27 @@ func TestEscapeControlReader(t *testing.T) {
// lets add some control characters to the tests
tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
copy(tests, escapeControlTests)

// if there is a BOM, we should keep the BOM
addPrefix := func(prefix string, s string) string {
if strings.HasPrefix(s, "\xef\xbb\xbf") {
return s[:3] + prefix + s[3:]
}
return prefix + s
}
for _, test := range escapeControlTests {
test.name += " (+Control)"
test.text = "\u001E" + test.text
test.result = `<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">` + "\u001e" + `</span></span>` + test.result
test.text = addPrefix("\u001E", test.text)
test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">`+"\u001e"+`</span></span>`, test.result)
test.status.Escaped = true
test.status.HasControls = true
tests = append(tests, test)
}

for _, test := range escapeControlTests {
test.name += " (+Mark)"
test.text = "\u0300" + test.text
test.result = `<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">` + "\u0300" + `</span></span>` + test.result
test.text = addPrefix("\u0300", test.text)
test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+0300]"><span class="char">`+"\u0300"+`</span></span>`, test.result)
test.status.Escaped = true
test.status.HasMarks = true
tests = append(tests, test)
Expand Down