Skip to content

Commit 4e27cc4

Browse files
ethantkoenig authored and lafriks committed
Fix synchronization bug in repo indexer (#3455) (#3461)
1 parent f61ef28 commit 4e27cc4

File tree

1 file changed

+100
-62
lines changed

1 file changed

+100
-62
lines changed

models/repo_indexer.go

Lines changed: 100 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
package models
66

77
import (
8-
"io/ioutil"
9-
"os"
10-
"path"
8+
"fmt"
119
"strconv"
1210
"strings"
1311

@@ -16,8 +14,6 @@ import (
1614
"code.gitea.io/gitea/modules/indexer"
1715
"code.gitea.io/gitea/modules/log"
1816
"code.gitea.io/gitea/modules/setting"
19-
20-
"github.com/Unknwon/com"
2117
)
2218

2319
// RepoIndexerStatus status of a repo's entry in the repo indexer
@@ -132,77 +128,86 @@ func populateRepoIndexer(maxRepoID int64) {
132128
}
133129

134130
func updateRepoIndexer(repo *Repository) error {
135-
changes, err := getRepoChanges(repo)
131+
sha, err := getDefaultBranchSha(repo)
132+
if err != nil {
133+
return err
134+
}
135+
changes, err := getRepoChanges(repo, sha)
136136
if err != nil {
137137
return err
138138
} else if changes == nil {
139139
return nil
140140
}
141141

142142
batch := indexer.RepoIndexerBatch()
143-
for _, filename := range changes.UpdatedFiles {
144-
if err := addUpdate(filename, repo, batch); err != nil {
143+
for _, update := range changes.Updates {
144+
if err := addUpdate(update, repo, batch); err != nil {
145145
return err
146146
}
147147
}
148-
for _, filename := range changes.RemovedFiles {
148+
for _, filename := range changes.RemovedFilenames {
149149
if err := addDelete(filename, repo, batch); err != nil {
150150
return err
151151
}
152152
}
153153
if err = batch.Flush(); err != nil {
154154
return err
155155
}
156-
return updateLastIndexSync(repo)
156+
return repo.updateIndexerStatus(sha)
157157
}
158158

159159
// repoChanges changes (file additions/updates/removals) to a repo
160160
type repoChanges struct {
161-
UpdatedFiles []string
162-
RemovedFiles []string
161+
Updates []fileUpdate
162+
RemovedFilenames []string
163163
}
164164

165-
// getRepoChanges returns changes to repo since last indexer update
166-
func getRepoChanges(repo *Repository) (*repoChanges, error) {
167-
repoWorkingPool.CheckIn(com.ToStr(repo.ID))
168-
defer repoWorkingPool.CheckOut(com.ToStr(repo.ID))
165+
type fileUpdate struct {
166+
Filename string
167+
BlobSha string
168+
}
169169

170-
if err := repo.UpdateLocalCopyBranch(""); err != nil {
171-
return nil, err
172-
} else if !git.IsBranchExist(repo.LocalCopyPath(), repo.DefaultBranch) {
173-
// repo does not have any commits yet, so nothing to update
174-
return nil, nil
175-
} else if err = repo.UpdateLocalCopyBranch(repo.DefaultBranch); err != nil {
176-
return nil, err
177-
} else if err = repo.getIndexerStatus(); err != nil {
170+
func getDefaultBranchSha(repo *Repository) (string, error) {
171+
stdout, err := git.NewCommand("show-ref", "-s", repo.DefaultBranch).RunInDir(repo.RepoPath())
172+
if err != nil {
173+
return "", err
174+
}
175+
return strings.TrimSpace(stdout), nil
176+
}
177+
178+
// getRepoChanges returns changes to repo since last indexer update
179+
func getRepoChanges(repo *Repository, revision string) (*repoChanges, error) {
180+
if err := repo.getIndexerStatus(); err != nil {
178181
return nil, err
179182
}
180183

181184
if len(repo.IndexerStatus.CommitSha) == 0 {
182-
return genesisChanges(repo)
185+
return genesisChanges(repo, revision)
183186
}
184-
return nonGenesisChanges(repo)
187+
return nonGenesisChanges(repo, revision)
185188
}
186189

187-
func addUpdate(filename string, repo *Repository, batch *indexer.Batch) error {
188-
filepath := path.Join(repo.LocalCopyPath(), filename)
189-
if stat, err := os.Stat(filepath); err != nil {
190+
func addUpdate(update fileUpdate, repo *Repository, batch *indexer.Batch) error {
191+
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
192+
RunInDir(repo.RepoPath())
193+
if err != nil {
190194
return err
191-
} else if stat.Size() > setting.Indexer.MaxIndexerFileSize {
192-
return nil
193-
} else if stat.IsDir() {
194-
// file could actually be a directory, if it is the root of a submodule.
195-
// We do not index submodule contents, so don't do anything.
195+
}
196+
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
197+
return fmt.Errorf("Misformatted git cat-file output: %v", err)
198+
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
196199
return nil
197200
}
198-
fileContents, err := ioutil.ReadFile(filepath)
201+
202+
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
203+
RunInDirBytes(repo.RepoPath())
199204
if err != nil {
200205
return err
201206
} else if !base.IsTextFile(fileContents) {
202207
return nil
203208
}
204209
return batch.Add(indexer.RepoIndexerUpdate{
205-
Filepath: filename,
210+
Filepath: update.Filename,
206211
Op: indexer.RepoIndexerOpUpdate,
207212
Data: &indexer.RepoIndexerData{
208213
RepoID: repo.ID,
@@ -221,42 +226,76 @@ func addDelete(filename string, repo *Repository, batch *indexer.Batch) error {
221226
})
222227
}
223228

224-
// genesisChanges get changes to add repo to the indexer for the first time
225-
func genesisChanges(repo *Repository) (*repoChanges, error) {
226-
var changes repoChanges
227-
stdout, err := git.NewCommand("ls-files").RunInDir(repo.LocalCopyPath())
228-
if err != nil {
229-
return nil, err
230-
}
231-
for _, line := range strings.Split(stdout, "\n") {
232-
filename := strings.TrimSpace(line)
233-
if len(filename) == 0 {
229+
// parseGitLsTreeOutput parses the output of a `git ls-tree --full-tree` command
230+
func parseGitLsTreeOutput(stdout string) ([]fileUpdate, error) {
231+
lines := strings.Split(stdout, "\n")
232+
updates := make([]fileUpdate, 0, len(lines))
233+
for _, line := range lines {
234+
// expect line to be "<mode> <object-type> <object-sha>\t<filename>"
235+
line = strings.TrimSpace(line)
236+
if len(line) == 0 {
234237
continue
235-
} else if filename[0] == '"' {
238+
}
239+
firstSpaceIndex := strings.IndexByte(line, ' ')
240+
if firstSpaceIndex < 0 {
241+
log.Error(4, "Misformatted git ls-tree output: %s", line)
242+
continue
243+
}
244+
tabIndex := strings.IndexByte(line, '\t')
245+
if tabIndex < 42+firstSpaceIndex || tabIndex == len(line)-1 {
246+
log.Error(4, "Misformatted git ls-tree output: %s", line)
247+
continue
248+
}
249+
if objectType := line[firstSpaceIndex+1 : tabIndex-41]; objectType != "blob" {
250+
// submodules appear as commit objects, we do not index submodules
251+
continue
252+
}
253+
254+
blobSha := line[tabIndex-40 : tabIndex]
255+
filename := line[tabIndex+1:]
256+
if filename[0] == '"' {
257+
var err error
236258
filename, err = strconv.Unquote(filename)
237259
if err != nil {
238260
return nil, err
239261
}
240262
}
241-
changes.UpdatedFiles = append(changes.UpdatedFiles, filename)
263+
updates = append(updates, fileUpdate{
264+
Filename: filename,
265+
BlobSha: blobSha,
266+
})
267+
}
268+
return updates, nil
269+
}
270+
271+
// genesisChanges gets the changes needed to add repo to the indexer for the first time
272+
func genesisChanges(repo *Repository, revision string) (*repoChanges, error) {
273+
var changes repoChanges
274+
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
275+
RunInDir(repo.RepoPath())
276+
if err != nil {
277+
return nil, err
242278
}
243-
return &changes, nil
279+
changes.Updates, err = parseGitLsTreeOutput(stdout)
280+
return &changes, err
244281
}
245282

246283
// nonGenesisChanges gets the changes made since the previous indexer update
247-
func nonGenesisChanges(repo *Repository) (*repoChanges, error) {
284+
func nonGenesisChanges(repo *Repository, revision string) (*repoChanges, error) {
248285
diffCmd := git.NewCommand("diff", "--name-status",
249-
repo.IndexerStatus.CommitSha, "HEAD")
250-
stdout, err := diffCmd.RunInDir(repo.LocalCopyPath())
286+
repo.IndexerStatus.CommitSha, revision)
287+
stdout, err := diffCmd.RunInDir(repo.RepoPath())
251288
if err != nil {
252289
// previous commit sha may have been removed by a force push, so
253290
// try rebuilding from scratch
291+
log.Warn("git diff: %v", err)
254292
if err = indexer.DeleteRepoFromIndexer(repo.ID); err != nil {
255293
return nil, err
256294
}
257-
return genesisChanges(repo)
295+
return genesisChanges(repo, revision)
258296
}
259297
var changes repoChanges
298+
updatedFilenames := make([]string, 0, 10)
260299
for _, line := range strings.Split(stdout, "\n") {
261300
line = strings.TrimSpace(line)
262301
if len(line) == 0 {
@@ -274,23 +313,22 @@ func nonGenesisChanges(repo *Repository) (*repoChanges, error) {
274313

275314
switch status := line[0]; status {
276315
case 'M', 'A':
277-
changes.UpdatedFiles = append(changes.UpdatedFiles, filename)
316+
updatedFilenames = append(updatedFilenames, filename)
278317
case 'D':
279-
changes.RemovedFiles = append(changes.RemovedFiles, filename)
318+
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
280319
default:
281320
log.Warn("Unrecognized status: %c (line=%s)", status, line)
282321
}
283322
}
284-
return &changes, nil
285-
}
286323

287-
func updateLastIndexSync(repo *Repository) error {
288-
stdout, err := git.NewCommand("rev-parse", "HEAD").RunInDir(repo.LocalCopyPath())
324+
cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
325+
cmd.AddArguments(updatedFilenames...)
326+
stdout, err = cmd.RunInDir(repo.RepoPath())
289327
if err != nil {
290-
return err
328+
return nil, err
291329
}
292-
sha := strings.TrimSpace(stdout)
293-
return repo.updateIndexerStatus(sha)
330+
changes.Updates, err = parseGitLsTreeOutput(stdout)
331+
return &changes, err
294332
}
295333

296334
func processRepoIndexerOperationQueue() {

0 commit comments

Comments
 (0)