Skip to content

Commit 4ce32c9

Browse files
authored
Detect encoding changes while parsing diff (#16330)
* Detect encoding changes while parsing diff
1 parent 2614309 commit 4ce32c9

File tree

1 file changed

+31
-19
lines changed

1 file changed

+31
-19
lines changed

services/gitdiff/gitdiff.go

+31 -19
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ import (
3232

3333
"github.com/sergi/go-diff/diffmatchpatch"
3434
stdcharset "golang.org/x/net/html/charset"
35+
"golang.org/x/text/encoding"
3536
"golang.org/x/text/transform"
3637
)
3738

@@ -883,35 +884,46 @@ parsingLoop:
883884

884885
}
885886

886-
// FIXME: There are numerous issues with this:
887+
// TODO: There are numerous issues with this:
887888
// - we might want to consider detecting encoding while parsing but...
888889
// - we're likely to fail to get the correct encoding here anyway as we won't have enough information
889-
// - and this doesn't really account for changes in encoding
890-
var buf bytes.Buffer
890+
var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)
891+
var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3)
892+
diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer)
893+
diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer)
894+
diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer)
891895
for _, f := range diff.Files {
892-
buf.Reset()
896+
for _, buffer := range diffLineTypeBuffers {
897+
buffer.Reset()
898+
}
893899
for _, sec := range f.Sections {
894900
for _, l := range sec.Lines {
895901
if l.Type == DiffLineSection {
896902
continue
897903
}
898-
buf.WriteString(l.Content[1:])
899-
buf.WriteString("\n")
904+
diffLineTypeBuffers[l.Type].WriteString(l.Content[1:])
905+
diffLineTypeBuffers[l.Type].WriteString("\n")
900906
}
901907
}
902-
charsetLabel, err := charset.DetectEncoding(buf.Bytes())
903-
if charsetLabel != "UTF-8" && err == nil {
904-
encoding, _ := stdcharset.Lookup(charsetLabel)
905-
if encoding != nil {
906-
d := encoding.NewDecoder()
907-
for _, sec := range f.Sections {
908-
for _, l := range sec.Lines {
909-
if l.Type == DiffLineSection {
910-
continue
911-
}
912-
if c, _, err := transform.String(d, l.Content[1:]); err == nil {
913-
l.Content = l.Content[0:1] + c
914-
}
908+
for lineType, buffer := range diffLineTypeBuffers {
909+
diffLineTypeDecoders[lineType] = nil
910+
if buffer.Len() == 0 {
911+
continue
912+
}
913+
charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
914+
if charsetLabel != "UTF-8" && err == nil {
915+
encoding, _ := stdcharset.Lookup(charsetLabel)
916+
if encoding != nil {
917+
diffLineTypeDecoders[lineType] = encoding.NewDecoder()
918+
}
919+
}
920+
}
921+
for _, sec := range f.Sections {
922+
for _, l := range sec.Lines {
923+
decoder := diffLineTypeDecoders[l.Type]
924+
if decoder != nil {
925+
if c, _, err := transform.String(decoder, l.Content[1:]); err == nil {
926+
l.Content = l.Content[0:1] + c
915927
}
916928
}
917929
}

0 commit comments

Comments
 (0)