Skip to content

Commit 0c61376

Browse files
authored
Add Tabular Diff for CSV files (#14661)
Implements request #14320. The rendering of CSV files now matches the diff style. * Moved CSV logic into base package. * Added method to create a tabular diff. * Added CSV compare context. * Added CSV diff template. * Use new table style in CSV markup. * Added file size limit for CSV rendering. * Display CSV parser errors in diff. * Lazy read single file. * Lazy read rows for full diff. * Added unit tests for various CSV changes.
1 parent d3b8127 commit 0c61376

File tree

20 files changed

+937
-118
lines changed

20 files changed

+937
-118
lines changed

custom/conf/app.example.ini

+4
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ EVENT_SOURCE_UPDATE_TIME = 10s
248248
; Whether to render SVG files as images. If SVG rendering is disabled, SVG files are displayed as text and cannot be embedded in markdown files as images.
249249
ENABLE_RENDER = true
250250

251+
[ui.csv]
252+
; Maximum allowed file size in bytes to render CSV files as a table. (Set to 0 for no limit.)
253+
MAX_FILE_SIZE = 524288
254+
251255
[markdown]
252256
; Render soft line breaks as hard line breaks, which means a single newline character between
253257
; paragraphs will cause a line break and adding trailing whitespace to paragraphs is not

docs/content/doc/advanced/config-cheat-sheet.en-us.md

+4
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
198198

199199
- `ENABLE_RENDER`: **true**: Whether to render SVG files as images. If SVG rendering is disabled, SVG files are displayed as text and cannot be embedded in markdown files as images.
200200

201+
### UI - CSV Files (`ui.csv`)
202+
203+
- `MAX_FILE_SIZE`: **524288** (512 KiB): Maximum allowed file size in bytes to render CSV files as a table. (Set to 0 for no limit.)
204+
201205
## Markdown (`markdown`)
202206

203207
- `ENABLE_HARD_LINE_BREAK_IN_COMMENTS`: **true**: Render soft line breaks as hard line breaks in comments, which

modules/csv/csv.go

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// Copyright 2021 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package csv
6+
7+
import (
8+
"bytes"
9+
"encoding/csv"
10+
"errors"
11+
"regexp"
12+
"strings"
13+
14+
"code.gitea.io/gitea/modules/translation"
15+
"code.gitea.io/gitea/modules/util"
16+
)
17+
18+
// quoteRegexp matches a quoted section: an opening single or double quote,
// the shortest possible run of one or more characters (newlines included),
// and a closing single or double quote. guessDelimiter uses it to strip
// quoted text before counting delimiters.
// NOTE(review): the opening and closing quote characters are not required to
// be the same, and an empty quoted string ("") is not matched on its own
// because at least one character is required between the quotes.
var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
19+
20+
// CreateReader builds a csv.Reader over rawBytes that splits fields on
// delimiter and ignores leading white space in every field.
func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader {
	reader := csv.NewReader(bytes.NewReader(rawBytes))
	reader.Comma, reader.TrimLeadingSpace = delimiter, true
	return reader
}
27+
28+
// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
29+
func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader {
30+
delimiter := guessDelimiter(rawBytes)
31+
return CreateReader(rawBytes, delimiter)
32+
}
33+
34+
// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
35+
// Reads at most 10k bytes & 10 lines.
36+
func guessDelimiter(data []byte) rune {
37+
maxLines := 10
38+
maxBytes := util.Min(len(data), 1e4)
39+
text := string(data[:maxBytes])
40+
text = quoteRegexp.ReplaceAllLiteralString(text, "")
41+
lines := strings.SplitN(text, "\n", maxLines+1)
42+
lines = lines[:util.Min(maxLines, len(lines))]
43+
44+
delimiters := []rune{',', ';', '\t', '|', '@'}
45+
bestDelim := delimiters[0]
46+
bestScore := 0.0
47+
for _, delim := range delimiters {
48+
score := scoreDelimiter(lines, delim)
49+
if score > bestScore {
50+
bestScore = score
51+
bestDelim = delim
52+
}
53+
}
54+
55+
return bestDelim
56+
}
57+
58+
// scoreDelimiter uses a count & regularity metric to evaluate a delimiter
// against lines of CSV.
//
// The score is the total number of occurrences of delim in all non-empty
// lines, scaled by (1 - irregular/len(lines)), where irregular is the number
// of lines whose occurrence count differs from the running per-line maximum.
// A higher score means delim is a more plausible field separator.
//
// An empty lines slice scores 0 (rather than NaN from a 0/0 division).
func scoreDelimiter(lines []string, delim rune) float64 {
	if len(lines) == 0 {
		return 0
	}

	sep := string(delim) // loop-invariant, convert once
	countTotal := 0
	countLineMax := 0
	linesNotEqual := 0

	for _, line := range lines {
		if len(line) == 0 {
			continue
		}

		countLine := strings.Count(line, sep)
		countTotal += countLine
		if countLine == countLineMax {
			continue
		}
		// A mismatch only counts as an irregularity once a non-zero maximum
		// has been established by an earlier line.
		if countLineMax != 0 {
			linesNotEqual++
		}
		if countLine > countLineMax {
			countLineMax = countLine
		}
	}

	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}
81+
82+
// FormatError converts csv errors into readable messages.
83+
func FormatError(err error, locale translation.Locale) (string, error) {
84+
var perr *csv.ParseError
85+
if errors.As(err, &perr) {
86+
if perr.Err == csv.ErrFieldCount {
87+
return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
88+
}
89+
return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil
90+
}
91+
92+
return "", err
93+
}

modules/csv/csv_test.go

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Copyright 2021 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package csv
6+
7+
import (
8+
"testing"
9+
10+
"github.com/stretchr/testify/assert"
11+
)
12+
13+
func TestCreateReader(t *testing.T) {
14+
rd := CreateReader([]byte{}, ',')
15+
assert.Equal(t, ',', rd.Comma)
16+
}
17+
18+
func TestCreateReaderAndGuessDelimiter(t *testing.T) {
19+
input := "a;b;c\n1;2;3\n4;5;6"
20+
21+
rd := CreateReaderAndGuessDelimiter([]byte(input))
22+
assert.Equal(t, ';', rd.Comma)
23+
}
24+
25+
func TestGuessDelimiter(t *testing.T) {
26+
var kases = map[string]rune{
27+
"a": ',',
28+
"1,2": ',',
29+
"1;2": ';',
30+
"1\t2": '\t',
31+
"1|2": '|',
32+
"1,2,3;4,5,6;7,8,9\na;b;c": ';',
33+
"\"1,2,3,4\";\"a\nb\"\nc;d": ';',
34+
"<br/>": ',',
35+
}
36+
37+
for k, v := range kases {
38+
assert.EqualValues(t, guessDelimiter([]byte(k)), v)
39+
}
40+
}

modules/markup/csv/csv.go

+40-63
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,20 @@ package markup
66

77
import (
88
"bytes"
9-
"encoding/csv"
109
"html"
1110
"io"
12-
"regexp"
13-
"strings"
11+
"strconv"
1412

13+
"code.gitea.io/gitea/modules/csv"
1514
"code.gitea.io/gitea/modules/markup"
16-
"code.gitea.io/gitea/modules/util"
15+
"code.gitea.io/gitea/modules/setting"
1716
)
1817

19-
var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
20-
2118
func init() {
2219
markup.RegisterParser(Parser{})
23-
2420
}
2521

26-
// Parser implements markup.Parser for orgmode
22+
// Parser implements markup.Parser for csv files
2723
type Parser struct {
2824
}
2925

@@ -38,11 +34,35 @@ func (Parser) Extensions() []string {
3834
}
3935

4036
// Render implements markup.Parser
41-
func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
42-
rd := csv.NewReader(bytes.NewReader(rawBytes))
43-
rd.Comma = p.bestDelimiter(rawBytes)
37+
func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
4438
var tmpBlock bytes.Buffer
45-
tmpBlock.WriteString(`<table class="table">`)
39+
40+
if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) {
41+
tmpBlock.WriteString("<pre>")
42+
tmpBlock.WriteString(html.EscapeString(string(rawBytes)))
43+
tmpBlock.WriteString("</pre>")
44+
return tmpBlock.Bytes()
45+
}
46+
47+
rd := csv.CreateReaderAndGuessDelimiter(rawBytes)
48+
49+
writeField := func(element, class, field string) {
50+
tmpBlock.WriteString("<")
51+
tmpBlock.WriteString(element)
52+
if len(class) > 0 {
53+
tmpBlock.WriteString(" class=\"")
54+
tmpBlock.WriteString(class)
55+
tmpBlock.WriteString("\"")
56+
}
57+
tmpBlock.WriteString(">")
58+
tmpBlock.WriteString(html.EscapeString(field))
59+
tmpBlock.WriteString("</")
60+
tmpBlock.WriteString(element)
61+
tmpBlock.WriteString(">")
62+
}
63+
64+
tmpBlock.WriteString(`<table class="data-table">`)
65+
row := 1
4666
for {
4767
fields, err := rd.Read()
4868
if err == io.EOF {
@@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]strin
5272
continue
5373
}
5474
tmpBlock.WriteString("<tr>")
75+
element := "td"
76+
if row == 1 {
77+
element = "th"
78+
}
79+
writeField(element, "line-num", strconv.Itoa(row))
5580
for _, field := range fields {
56-
tmpBlock.WriteString("<td>")
57-
tmpBlock.WriteString(html.EscapeString(field))
58-
tmpBlock.WriteString("</td>")
81+
writeField(element, "", field)
5982
}
6083
tmpBlock.WriteString("</tr>")
84+
85+
row++
6186
}
6287
tmpBlock.WriteString("</table>")
6388

6489
return tmpBlock.Bytes()
6590
}
66-
67-
// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
68-
// Reads at most 10k bytes & 10 lines.
69-
func (p Parser) bestDelimiter(data []byte) rune {
70-
maxLines := 10
71-
maxBytes := util.Min(len(data), 1e4)
72-
text := string(data[:maxBytes])
73-
text = quoteRegexp.ReplaceAllLiteralString(text, "")
74-
lines := strings.SplitN(text, "\n", maxLines+1)
75-
lines = lines[:util.Min(maxLines, len(lines))]
76-
77-
delimiters := []rune{',', ';', '\t', '|'}
78-
bestDelim := delimiters[0]
79-
bestScore := 0.0
80-
for _, delim := range delimiters {
81-
score := p.scoreDelimiter(lines, delim)
82-
if score > bestScore {
83-
bestScore = score
84-
bestDelim = delim
85-
}
86-
}
87-
88-
return bestDelim
89-
}
90-
91-
// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
92-
func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
93-
countTotal := 0
94-
countLineMax := 0
95-
linesNotEqual := 0
96-
97-
for _, line := range lines {
98-
if len(line) == 0 {
99-
continue
100-
}
101-
102-
countLine := strings.Count(line, string(delim))
103-
countTotal += countLine
104-
if countLine != countLineMax {
105-
if countLineMax != 0 {
106-
linesNotEqual++
107-
}
108-
countLineMax = util.Max(countLine, countLineMax)
109-
}
110-
}
111-
112-
return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
113-
}

modules/markup/csv/csv_test.go

+4-8
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,10 @@ import (
1313
func TestRenderCSV(t *testing.T) {
1414
var parser Parser
1515
var kases = map[string]string{
16-
"a": "<table class=\"table\"><tr><td>a</td></tr></table>",
17-
"1,2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
18-
"1;2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
19-
"1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
20-
"1|2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
21-
"1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>",
22-
"\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>",
23-
"<br/>": "<table class=\"table\"><tr><td>&lt;br/&gt;</td></tr></table>",
16+
"a": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>a</th></tr></table>",
17+
"1,2": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr></table>",
18+
"1;2\n3;4": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr><tr><td class=\"line-num\">2</td><td>3</td><td>4</td></tr></table>",
19+
"<br/>": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>&lt;br/&gt;</th></tr></table>",
2420
}
2521

2622
for k, v := range kases {

modules/markup/sanitizer.go

+4
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ func ReplaceSanitizer() {
6969
// Allow icons, emojis, and chroma syntax on span
7070
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$`)).OnElements("span")
7171

72+
// Allow data tables
73+
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table")
74+
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td")
75+
7276
// Allow generally safe attributes
7377
generalSafeAttrs := []string{"abbr", "accept", "accept-charset",
7478
"accesskey", "action", "align", "alt",

modules/setting/setting.go

+9
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,10 @@ var (
213213
Enabled bool `ini:"ENABLE_RENDER"`
214214
} `ini:"ui.svg"`
215215

216+
CSV struct {
217+
MaxFileSize int64
218+
} `ini:"ui.csv"`
219+
216220
Admin struct {
217221
UserPagingNum int
218222
RepoPagingNum int
@@ -258,6 +262,11 @@ var (
258262
}{
259263
Enabled: true,
260264
},
265+
CSV: struct {
266+
MaxFileSize int64
267+
}{
268+
MaxFileSize: 524288,
269+
},
261270
Admin: struct {
262271
UserPagingNum int
263272
RepoPagingNum int

options/locale/locale_en-US.ini

+5
Original file line numberDiff line numberDiff line change
@@ -1860,6 +1860,7 @@ diff.whitespace_ignore_at_eol = Ignore changes in whitespace at EOL
18601860
diff.stats_desc = <strong> %d changed files</strong> with <strong>%d additions</strong> and <strong>%d deletions</strong>
18611861
diff.stats_desc_file = %d changes: %d additions and %d deletions
18621862
diff.bin = BIN
1863+
diff.bin_not_shown = Binary file not shown.
18631864
diff.view_file = View File
18641865
diff.file_before = Before
18651866
diff.file_after = After
@@ -1960,6 +1961,10 @@ topic.done = Done
19601961
topic.count_prompt = You can not select more than 25 topics
19611962
topic.format_prompt = Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
19621963

1964+
error.csv.too_large = Can't render this file because it is too large.
1965+
error.csv.unexpected = Can't render this file because it contains an unexpected character in line %d and column %d.
1966+
error.csv.invalid_field_count = Can't render this file because it has a wrong number of fields in line %d.
1967+
19631968
[org]
19641969
org_name_holder = Organization Name
19651970
org_full_name_holder = Organization Full Name

routers/repo/commit.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,8 @@ func Diff(ctx *context.Context) {
336336
return
337337
}
338338
}
339-
setImageCompareContext(ctx, parentCommit, commit)
340339
headTarget := path.Join(userName, repoName)
341-
setPathsCompareContext(ctx, parentCommit, commit, headTarget)
340+
setCompareContext(ctx, parentCommit, commit, headTarget)
342341
ctx.Data["Title"] = commit.Summary() + " · " + base.ShortSha(commitID)
343342
ctx.Data["Commit"] = commit
344343
verification := models.ParseCommitWithSignature(commit)

0 commit comments

Comments
 (0)