Skip to content

Commit 21fb791

Browse files
zeripath authored and techknowlogick committed
Detect encoding and BOM in content (#6727) (#6765)
Detect and remove a decoded BOM when showing content. Restore the previous encoding and BOM when updating content. On error keep as UTF-8 encoding. Signed-off-by: Andrew Thornton <[email protected]>
1 parent 8b3aad9 commit 21fb791

File tree

3 files changed

+127
-7
lines changed

3 files changed

+127
-7
lines changed

modules/base/tool.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package base
66

77
import (
8+
"bytes"
89
"crypto/md5"
910
"crypto/rand"
1011
"crypto/sha1"
@@ -32,6 +33,9 @@ import (
3233
"github.com/gogits/chardet"
3334
)
3435

36+
// UTF8BOM holds the three bytes (EF BB BF) that make up the UTF-8
// byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
38+
3539
// EncodeMD5 encodes string to md5 hex value.
3640
func EncodeMD5(str string) string {
3741
m := md5.New()
@@ -87,6 +91,14 @@ func DetectEncoding(content []byte) (string, error) {
8791
return result.Charset, err
8892
}
8993

94+
// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
95+
func RemoveBOMIfPresent(content []byte) []byte {
96+
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
97+
return content[3:]
98+
}
99+
return content
100+
}
101+
90102
// BasicAuthDecode decode basic auth string
91103
func BasicAuthDecode(encoded string) (string, string, error) {
92104
s, err := base64.StdEncoding.DecodeString(encoded)

modules/templates/helper.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
267267
if err != nil {
268268
return "", err
269269
} else if charsetLabel == "UTF-8" {
270-
return string(content), nil
270+
return string(base.RemoveBOMIfPresent(content)), nil
271271
}
272272

273273
encoding, _ := charset.Lookup(charsetLabel)
@@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
277277

278278
// If there is an error, we concatenate the nicely decoded part and the
279279
// original left over. This way we won't lose data.
280-
result, n, err := transform.String(encoding.NewDecoder(), string(content))
280+
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
281281
if err != nil {
282-
result = result + string(content[n:])
282+
result = append(result, content[n:]...)
283283
}
284284

285-
return result, err
285+
result = base.RemoveBOMIfPresent(result)
286+
287+
return string(result), err
286288
}
287289

288290
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
289291
func ToUTF8WithFallback(content []byte) []byte {
290292
charsetLabel, err := base.DetectEncoding(content)
291293
if err != nil || charsetLabel == "UTF-8" {
292-
return content
294+
return base.RemoveBOMIfPresent(content)
293295
}
294296

295297
encoding, _ := charset.Lookup(charsetLabel)
@@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
304306
return append(result, content[n:]...)
305307
}
306308

307-
return result
309+
return base.RemoveBOMIfPresent(result)
308310
}
309311

310312
// ToUTF8 converts content to UTF8 encoding and ignore error

modules/uploader/update.go

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,85 @@
55
package uploader
66

77
import (
8+
"bytes"
89
"fmt"
910
"strings"
1011

12+
"golang.org/x/net/html/charset"
13+
"golang.org/x/text/transform"
14+
1115
"code.gitea.io/git"
1216
"code.gitea.io/gitea/models"
17+
"code.gitea.io/gitea/modules/base"
1318
"code.gitea.io/gitea/modules/lfs"
19+
"code.gitea.io/gitea/modules/log"
1420
"code.gitea.io/gitea/modules/setting"
1521
)
1622

23+
func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
24+
reader, err := entry.Blob().DataAsync()
25+
if err != nil {
26+
// return default
27+
return "UTF-8", false
28+
}
29+
defer reader.Close()
30+
buf := make([]byte, 1024)
31+
n, err := reader.Read(buf)
32+
if err != nil {
33+
// return default
34+
return "UTF-8", false
35+
}
36+
buf = buf[:n]
37+
38+
if setting.LFS.StartServer {
39+
meta := lfs.IsPointerFile(&buf)
40+
if meta != nil {
41+
meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
42+
if err != nil && err != models.ErrLFSObjectNotExist {
43+
// return default
44+
return "UTF-8", false
45+
}
46+
}
47+
if meta != nil {
48+
dataRc, err := lfs.ReadMetaObject(meta)
49+
if err != nil {
50+
// return default
51+
return "UTF-8", false
52+
}
53+
defer dataRc.Close()
54+
buf = make([]byte, 1024)
55+
n, err = dataRc.Read(buf)
56+
if err != nil {
57+
// return default
58+
return "UTF-8", false
59+
}
60+
buf = buf[:n]
61+
}
62+
63+
}
64+
65+
encoding, err := base.DetectEncoding(buf)
66+
if err != nil {
67+
// just default to utf-8 and no bom
68+
return "UTF-8", false
69+
}
70+
if encoding == "UTF-8" {
71+
return encoding, bytes.Equal(buf[0:3], base.UTF8BOM)
72+
}
73+
charsetEncoding, _ := charset.Lookup(encoding)
74+
if charsetEncoding == nil {
75+
return "UTF-8", false
76+
}
77+
78+
result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf))
79+
80+
if n > 2 {
81+
return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM)
82+
}
83+
84+
return encoding, false
85+
}
86+
1787
// UpdateRepoFileOptions holds the repository file update options
1888
type UpdateRepoFileOptions struct {
1989
LastCommitID string
@@ -45,12 +115,29 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
45115
return fmt.Errorf("UpdateRepoFile: %v", err)
46116
}
47117

118+
encoding := "UTF-8"
119+
bom := false
120+
48121
if opts.IsNewFile {
49122
for _, file := range filesInIndex {
50123
if file == opts.NewTreeName {
51124
return models.ErrRepoFileAlreadyExist{FileName: opts.NewTreeName}
52125
}
53126
}
127+
} else {
128+
gitRepo, err := git.OpenRepository(t.basePath)
129+
if err != nil {
130+
return err
131+
}
132+
tree, err := gitRepo.GetTree("HEAD")
133+
if err != nil {
134+
return err
135+
}
136+
entry, err := tree.GetTreeEntryByPath(opts.OldTreeName)
137+
if err != nil {
138+
return err
139+
}
140+
encoding, bom = detectEncodingAndBOM(entry, repo)
54141
}
55142

56143
//var stdout string
@@ -72,9 +159,28 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
72159
}
73160

74161
content := opts.Content
162+
if bom {
163+
content = string(base.UTF8BOM) + content
164+
}
165+
if encoding != "UTF-8" {
166+
charsetEncoding, _ := charset.Lookup(encoding)
167+
if charsetEncoding != nil {
168+
result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
169+
if err != nil {
170+
// If we can't encode back into the original encoding we should just stick with UTF-8
171+
log.Error(4, "Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.NewTreeName, opts.OldTreeName, encoding, err)
172+
result = content
173+
}
174+
content = result
175+
} else {
176+
log.Error(4, "Unknown encoding: %s", encoding)
177+
}
178+
}
179+
// Reset the opts.Content with the re-encoded and BOM'd content
180+
opts.Content = content
75181
var lfsMetaObject *models.LFSMetaObject
76182

77-
if filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
183+
if setting.LFS.StartServer && filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
78184
// OK so we are supposed to LFS this data!
79185
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
80186
if err != nil {

0 commit comments

Comments
 (0)