Skip to content

Commit f6eedd4

Browse files
zeripathlafriks
authored and committed
UI: Detect and restore encoding and BOM in content (#6727)
* detect and remove a decoded BOM
  Signed-off-by: Andrew Thornton <[email protected]>
* Restore the previous encoding and BOM
* On error keep as UTF-8
  Signed-off-by: Andrew Thornton <[email protected]>
* create remove BOM function
* Deal with LFSed content
* Update modules/repofiles/update.go
* Fix final LFS bug
* Keep LFS sections referring to opts.Content
1 parent 4c34bc1 commit f6eedd4

File tree

3 files changed

+114
-7
lines changed

3 files changed

+114
-7
lines changed

modules/base/tool.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package base
66

77
import (
8+
"bytes"
89
"crypto/md5"
910
"crypto/rand"
1011
"crypto/sha1"
@@ -36,6 +37,9 @@ import (
3637
"github.com/gogits/chardet"
3738
)
3839

40+
// UTF8BOM holds the three bytes (EF BB BF) that make up the UTF-8 byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
42+
3943
// EncodeMD5 encodes string to md5 hex value.
4044
func EncodeMD5(str string) string {
4145
m := md5.New()
@@ -91,6 +95,14 @@ func DetectEncoding(content []byte) (string, error) {
9195
return result.Charset, err
9296
}
9397

98+
// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
99+
func RemoveBOMIfPresent(content []byte) []byte {
100+
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
101+
return content[3:]
102+
}
103+
return content
104+
}
105+
94106
// BasicAuthDecode decode basic auth string
95107
func BasicAuthDecode(encoded string) (string, string, error) {
96108
s, err := base64.StdEncoding.DecodeString(encoded)

modules/repofiles/update.go

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,19 @@
55
package repofiles
66

77
import (
8+
"bytes"
89
"fmt"
910
"path"
1011
"strings"
1112

13+
"golang.org/x/net/html/charset"
14+
"golang.org/x/text/transform"
15+
1216
"code.gitea.io/gitea/models"
17+
"code.gitea.io/gitea/modules/base"
1318
"code.gitea.io/gitea/modules/git"
1419
"code.gitea.io/gitea/modules/lfs"
20+
"code.gitea.io/gitea/modules/log"
1521
"code.gitea.io/gitea/modules/setting"
1622
"code.gitea.io/sdk/gitea"
1723
)
@@ -37,6 +43,70 @@ type UpdateRepoFileOptions struct {
3743
Committer *IdentityOptions
3844
}
3945

46+
func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
47+
reader, err := entry.Blob().DataAsync()
48+
if err != nil {
49+
// return default
50+
return "UTF-8", false
51+
}
52+
defer reader.Close()
53+
buf := make([]byte, 1024)
54+
n, err := reader.Read(buf)
55+
if err != nil {
56+
// return default
57+
return "UTF-8", false
58+
}
59+
buf = buf[:n]
60+
61+
if setting.LFS.StartServer {
62+
meta := lfs.IsPointerFile(&buf)
63+
if meta != nil {
64+
meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
65+
if err != nil && err != models.ErrLFSObjectNotExist {
66+
// return default
67+
return "UTF-8", false
68+
}
69+
}
70+
if meta != nil {
71+
dataRc, err := lfs.ReadMetaObject(meta)
72+
if err != nil {
73+
// return default
74+
return "UTF-8", false
75+
}
76+
defer dataRc.Close()
77+
buf = make([]byte, 1024)
78+
n, err = dataRc.Read(buf)
79+
if err != nil {
80+
// return default
81+
return "UTF-8", false
82+
}
83+
buf = buf[:n]
84+
}
85+
86+
}
87+
88+
encoding, err := base.DetectEncoding(buf)
89+
if err != nil {
90+
// just default to utf-8 and no bom
91+
return "UTF-8", false
92+
}
93+
if encoding == "UTF-8" {
94+
return encoding, bytes.Equal(buf[0:3], base.UTF8BOM)
95+
}
96+
charsetEncoding, _ := charset.Lookup(encoding)
97+
if charsetEncoding == nil {
98+
return "UTF-8", false
99+
}
100+
101+
result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf))
102+
103+
if n > 2 {
104+
return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM)
105+
}
106+
107+
return encoding, false
108+
}
109+
40110
// CreateOrUpdateRepoFile adds or updates a file in the given repository
41111
func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) {
42112
// If no branch name is set, assume master
@@ -118,6 +188,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
118188
opts.LastCommitID = commit.ID.String()
119189
}
120190

191+
encoding := "UTF-8"
192+
bom := false
193+
121194
if !opts.IsNewFile {
122195
fromEntry, err := commit.GetTreeEntryByPath(fromTreePath)
123196
if err != nil {
@@ -151,6 +224,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
151224
// haven't been made. We throw an error if one wasn't provided.
152225
return nil, models.ErrSHAOrCommitIDNotProvided{}
153226
}
227+
encoding, bom = detectEncodingAndBOM(fromEntry, repo)
154228
}
155229

156230
// For the path where this file will be created/updated, we need to make
@@ -235,9 +309,28 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
235309
}
236310

237311
content := opts.Content
312+
if bom {
313+
content = string(base.UTF8BOM) + content
314+
}
315+
if encoding != "UTF-8" {
316+
charsetEncoding, _ := charset.Lookup(encoding)
317+
if charsetEncoding != nil {
318+
result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
319+
if err != nil {
320+
// Look if we can't encode back in to the original we should just stick with utf-8
321+
log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err)
322+
result = content
323+
}
324+
content = result
325+
} else {
326+
log.Error("Unknown encoding: %s", encoding)
327+
}
328+
}
329+
// Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content
330+
opts.Content = content
238331
var lfsMetaObject *models.LFSMetaObject
239332

240-
if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
333+
if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
241334
// OK so we are supposed to LFS this data!
242335
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
243336
if err != nil {

modules/templates/helper.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
267267
if err != nil {
268268
return "", err
269269
} else if charsetLabel == "UTF-8" {
270-
return string(content), nil
270+
return string(base.RemoveBOMIfPresent(content)), nil
271271
}
272272

273273
encoding, _ := charset.Lookup(charsetLabel)
@@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
277277

278278
// If there is an error, we concatenate the nicely decoded part and the
279279
// original left over. This way we won't lose data.
280-
result, n, err := transform.String(encoding.NewDecoder(), string(content))
280+
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
281281
if err != nil {
282-
result = result + string(content[n:])
282+
result = append(result, content[n:]...)
283283
}
284284

285-
return result, err
285+
result = base.RemoveBOMIfPresent(result)
286+
287+
return string(result), err
286288
}
287289

288290
// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
289291
func ToUTF8WithFallback(content []byte) []byte {
290292
charsetLabel, err := base.DetectEncoding(content)
291293
if err != nil || charsetLabel == "UTF-8" {
292-
return content
294+
return base.RemoveBOMIfPresent(content)
293295
}
294296

295297
encoding, _ := charset.Lookup(charsetLabel)
@@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
304306
return append(result, content[n:]...)
305307
}
306308

307-
return result
309+
return base.RemoveBOMIfPresent(result)
308310
}
309311

310312
// ToUTF8 converts content to UTF8 encoding and ignore error

0 commit comments

Comments
 (0)