Skip to content

Commit b564612

Browse files
committed
Detect truncated utf-8 characters at the end of content as still representing utf-8
Our character detection algorithm can potentially incorrectly detect utf-8 as iso-8859-x if there is a truncated character at the end of the partially read file. This PR changes the detection algorithm to ignore truncated utf8 characters at the end of the buffer. Fix go-gitea#19743 Signed-off-by: Andrew Thornton <[email protected]>
1 parent a9af93c commit b564612

File tree

1 file changed

+20
-1
lines changed

1 file changed

+20
-1
lines changed

modules/charset/charset.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,26 @@ func RemoveBOMIfPresent(content []byte) []byte {
131131

132132
// DetectEncoding detect the encoding of content
133133
func DetectEncoding(content []byte) (string, error) {
134-
if utf8.Valid(content) {
134+
// First we check if the content represents valid utf8 content excepting a truncated character at the end.
135+
136+
// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
137+
// instead we walk backwards from the end to trim off the incomplete character
138+
toValidate := content
139+
end := len(toValidate) - 1
140+
141+
if end < 0 {
142+
// no-op
143+
} else if toValidate[end]>>5 == 0b110 {
144+
// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>
145+
toValidate = toValidate[:end]
146+
} else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 {
147+
// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>
148+
toValidate = toValidate[:end-1]
149+
} else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 {
150+
// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
151+
toValidate = toValidate[:end-2]
152+
}
153+
if utf8.Valid(toValidate) {
135154
log.Debug("Detected encoding: utf-8 (fast)")
136155
return "UTF-8", nil
137156
}

0 commit comments

Comments
 (0)