Skip to content

Commit e28cc79

Browse files
Improve sync performance for pull-mirrors (#19125)
This addresses #18352. It aims to improve performance (and resource use) of the `SyncReleasesWithTags` operation for pull-mirrors. For large repositories with many tags, `SyncReleasesWithTags` can be a costly operation (taking several minutes to complete). The reason is two-fold: 1. On sync, every upstream repo tag is compared (for changes) against existing local entries in the release table to ensure that they are up-to-date. 2. The procedure for getting _each tag_ involves a series of git operations ```bash git show-ref --tags -- v8.2.4477 git cat-file -t 29ab6ce9f36660cffaad3c8789e71162e5db5d2f git cat-file -p 29ab6ce9f36660cffaad3c8789e71162e5db5d2f git rev-list --count 29ab6ce9f36660cffaad3c8789e71162e5db5d2f ``` of which the `git rev-list --count` can be particularly heavy. This PR optimizes performance for pull-mirrors. We utilize the fact that a pull-mirror is always identical to its upstream and rebuild the entire release table on every sync and use a batch `git for-each-ref .. refs/tags` call to retrieve all tags in one go. For large mirror repos, with hundreds of annotated tags, this brings down the duration of the sync operation from several minutes to a few seconds. A few unscientific examples run on my local machine: - https://github.com/spring-projects/spring-boot (223 tags) - before: `0m28,673s` - after: `0m2,244s` - https://github.com/kubernetes/kubernetes (890 tags) - before: `8m00s` - after: `0m8,520s` - https://github.com/vim/vim (13954 tags) - before: `14m20,383s` - after: `0m35,467s` I added a `foreachref` package which contains a flexible way of specifying which reference fields are of interest (`git-for-each-ref(1)`) and of producing a parser for the expected output. These could be reused in other places where `for-each-ref` is used. I'll add unit tests for those if the overall PR looks promising.
1 parent b877504 commit e28cc79

File tree

7 files changed

+834
-19
lines changed

7 files changed

+834
-19
lines changed

modules/git/foreachref/format.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package foreachref
6+
7+
import (
8+
"encoding/hex"
9+
"fmt"
10+
"io"
11+
"strings"
12+
)
13+
14+
// Byte sequences used as delimiters in the 'git for-each-ref' output format.
var (
15+
// nullChar is a single NUL byte; NewFormat uses it as the field delimiter.
nullChar = []byte("\x00")
16+
// dualNullChar is two NUL bytes; NewFormat uses it as the reference
// delimiter.
dualNullChar = []byte("\x00\x00")
17+
)
18+
19+
// Format supports specifying and parsing an output format for 'git
20+
// for-each-ref'. See git-for-each-ref(1) for available fields.
21+
type Format struct {
22+
// fieldNames holds the %(fieldname)s to be passed to the '--format' flag
23+
// of for-each-ref. See git-for-each-ref(1) for available fields.
24+
fieldNames []string
25+
26+
// fieldDelim is the byte sequence that is used to separate fields
27+
// for each reference. fieldDelim and refDelim should be selected to not
28+
// interfere with each other and to not be present in field values.
29+
fieldDelim []byte
30+
// fieldDelimStr is a string representation of fieldDelim. Used to save
31+
// us from repetitive reallocation whenever we need the delimiter as a
32+
// string.
33+
fieldDelimStr string
34+
// refDelim is the byte sequence used to separate references from
35+
// each other in the output. fieldDelim and refDelim should be selected
36+
// to not interfere with each other and to not be present in field
37+
// values.
38+
refDelim []byte
39+
}
40+
41+
// NewFormat creates a forEachRefFormat using the specified fieldNames. See
42+
// git-for-each-ref(1) for available fields.
43+
func NewFormat(fieldNames ...string) Format {
44+
return Format{
45+
fieldNames: fieldNames,
46+
fieldDelim: nullChar,
47+
fieldDelimStr: string(nullChar),
48+
refDelim: dualNullChar,
49+
}
50+
}
51+
52+
// Flag returns a for-each-ref --format flag value that captures the fieldNames.
53+
func (f Format) Flag() string {
54+
var formatFlag strings.Builder
55+
for i, field := range f.fieldNames {
56+
// field key and field value
57+
formatFlag.WriteString(fmt.Sprintf("%s %%(%s)", field, field))
58+
59+
if i < len(f.fieldNames)-1 {
60+
// note: escape delimiters to allow control characters as
61+
// delimiters. For example, '%00' for null character or '%0a'
62+
// for newline.
63+
formatFlag.WriteString(f.hexEscaped(f.fieldDelim))
64+
}
65+
}
66+
formatFlag.WriteString(f.hexEscaped(f.refDelim))
67+
return formatFlag.String()
68+
}
69+
70+
// Parser returns a Parser capable of parsing 'git for-each-ref' output produced
71+
// with this Format.
72+
func (f Format) Parser(r io.Reader) *Parser {
73+
return NewParser(r, f)
74+
}
75+
76+
// hexEscaped produces hex-escpaed characters from a string. For example, "\n\0"
77+
// would turn into "%0a%00".
78+
func (f Format) hexEscaped(delim []byte) string {
79+
escaped := ""
80+
for i := 0; i < len(delim); i++ {
81+
escaped += "%" + hex.EncodeToString([]byte{delim[i]})
82+
}
83+
return escaped
84+
}

modules/git/foreachref/format_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package foreachref_test
6+
7+
import (
8+
"testing"
9+
10+
"code.gitea.io/gitea/modules/git/foreachref"
11+
12+
"github.com/stretchr/testify/require"
13+
)
14+
15+
func TestFormat_Flag(t *testing.T) {
16+
tests := []struct {
17+
name string
18+
19+
givenFormat foreachref.Format
20+
21+
wantFlag string
22+
}{
23+
{
24+
name: "references are delimited by dual null chars",
25+
26+
// no reference fields requested
27+
givenFormat: foreachref.NewFormat(),
28+
29+
// only a reference delimiter field in --format
30+
wantFlag: "%00%00",
31+
},
32+
33+
{
34+
name: "a field is a space-separated key-value pair",
35+
36+
givenFormat: foreachref.NewFormat("refname:short"),
37+
38+
// only a reference delimiter field
39+
wantFlag: "refname:short %(refname:short)%00%00",
40+
},
41+
42+
{
43+
name: "fields are separated by a null char field-delimiter",
44+
45+
givenFormat: foreachref.NewFormat("refname:short", "author"),
46+
47+
wantFlag: "refname:short %(refname:short)%00author %(author)%00%00",
48+
},
49+
50+
{
51+
name: "multiple fields",
52+
53+
givenFormat: foreachref.NewFormat("refname:short", "objecttype", "objectname"),
54+
55+
wantFlag: "refname:short %(refname:short)%00objecttype %(objecttype)%00objectname %(objectname)%00%00",
56+
},
57+
}
58+
59+
for _, test := range tests {
60+
tc := test // don't close over loop variable
61+
t.Run(tc.name, func(t *testing.T) {
62+
gotFlag := tc.givenFormat.Flag()
63+
64+
require.Equal(t, tc.wantFlag, gotFlag, "unexpected for-each-ref --format string. wanted: '%s', got: '%s'", tc.wantFlag, gotFlag)
65+
})
66+
}
67+
}

modules/git/foreachref/parser.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package foreachref
6+
7+
import (
8+
"bufio"
9+
"bytes"
10+
"fmt"
11+
"io"
12+
"strings"
13+
)
14+
15+
// Parser parses 'git for-each-ref' output according to a given output Format.
// Create one with NewParser, call Next until it returns nil, then check Err.
16+
type Parser struct {
17+
// scanner tokenizes 'git for-each-ref' output into "reference paragraphs".
18+
scanner *bufio.Scanner
19+
20+
// format represents the '--format' string that describes the expected
21+
// 'git for-each-ref' output structure.
22+
format Format
23+
24+
// err holds the last encountered error during parsing.
25+
err error
26+
}
27+
28+
// NewParser creates a 'git for-each-ref' output parser that will parse all
29+
// references in the provided Reader. The references in the output are assumed
30+
// to follow the specified Format.
31+
func NewParser(r io.Reader, format Format) *Parser {
32+
scanner := bufio.NewScanner(r)
33+
34+
// in addition to the reference delimiter we specified in the --format,
35+
// `git for-each-ref` will always add a newline after every reference.
36+
refDelim := make([]byte, 0, len(format.refDelim)+1)
37+
refDelim = append(refDelim, format.refDelim...)
38+
refDelim = append(refDelim, '\n')
39+
40+
// Split input into delimiter-separated "reference blocks".
41+
scanner.Split(
42+
func(data []byte, atEOF bool) (advance int, token []byte, err error) {
43+
// Scan until delimiter, marking end of reference.
44+
delimIdx := bytes.Index(data, refDelim)
45+
if delimIdx >= 0 {
46+
token := data[:delimIdx]
47+
advance := delimIdx + len(refDelim)
48+
return advance, token, nil
49+
}
50+
// If we're at EOF, we have a final, non-terminated reference. Return it.
51+
if atEOF {
52+
return len(data), data, nil
53+
}
54+
// Not yet a full field. Request more data.
55+
return 0, nil, nil
56+
})
57+
58+
return &Parser{
59+
scanner: scanner,
60+
format: format,
61+
err: nil,
62+
}
63+
}
64+
65+
// Next returns the next reference as a collection of key-value pairs. nil
66+
// denotes EOF but is also returned on errors. The Err method should always be
67+
// consulted after Next returning nil.
68+
//
69+
// It could, for example return something like:
70+
//
71+
// { "objecttype": "tag", "refname:short": "v1.16.4", "object": "f460b7543ed500e49c133c2cd85c8c55ee9dbe27" }
72+
//
73+
func (p *Parser) Next() map[string]string {
74+
if !p.scanner.Scan() {
75+
return nil
76+
}
77+
fields, err := p.parseRef(p.scanner.Text())
78+
if err != nil {
79+
p.err = err
80+
return nil
81+
}
82+
return fields
83+
}
84+
85+
// Err returns the latest encountered parsing error.
86+
func (p *Parser) Err() error {
87+
return p.err
88+
}
89+
90+
// parseRef parses out all key-value pairs from a single reference block, such as
91+
//
92+
// "objecttype tag\0refname:short v1.16.4\0object f460b7543ed500e49c133c2cd85c8c55ee9dbe27"
93+
//
94+
func (p *Parser) parseRef(refBlock string) (map[string]string, error) {
95+
if refBlock == "" {
96+
// must be at EOF
97+
return nil, nil
98+
}
99+
100+
fieldValues := make(map[string]string)
101+
102+
fields := strings.Split(refBlock, p.format.fieldDelimStr)
103+
if len(fields) != len(p.format.fieldNames) {
104+
return nil, fmt.Errorf("unexpected number of reference fields: wanted %d, was %d",
105+
len(fields), len(p.format.fieldNames))
106+
}
107+
for i, field := range fields {
108+
field = strings.TrimSpace(field)
109+
110+
var fieldKey string
111+
var fieldVal string
112+
firstSpace := strings.Index(field, " ")
113+
if firstSpace > 0 {
114+
fieldKey = field[:firstSpace]
115+
fieldVal = field[firstSpace+1:]
116+
} else {
117+
// could be the case if the requested field had no value
118+
fieldKey = field
119+
}
120+
121+
// enforce the format order of fields
122+
if p.format.fieldNames[i] != fieldKey {
123+
return nil, fmt.Errorf("unexpected field name at position %d: wanted: '%s', was: '%s'",
124+
i, p.format.fieldNames[i], fieldKey)
125+
}
126+
127+
fieldValues[fieldKey] = fieldVal
128+
}
129+
130+
return fieldValues, nil
131+
}

0 commit comments

Comments
 (0)