Skip to content

Commit 89b4e04

Browse files
authored
Refactor code indexer (#9313)
* Refactor code indexer * fix test * fix test * refactor code indexer * fix import * improve code * fix typo * fix test and make code clean * fix lint
1 parent 2f9564f commit 89b4e04

File tree

13 files changed

+649
-638
lines changed

13 files changed

+649
-638
lines changed

modules/indexer/code/bleve.go

+247-282
Large diffs are not rendered by default.

modules/indexer/code/bleve_test.go

+54
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,66 @@
55
package code
66

77
import (
8+
"os"
89
"path/filepath"
910
"testing"
1011

1112
"code.gitea.io/gitea/models"
13+
"code.gitea.io/gitea/modules/log"
14+
"code.gitea.io/gitea/modules/setting"
15+
16+
"github.com/stretchr/testify/assert"
1217
)
1318

1419
func TestMain(m *testing.M) {
1520
models.MainTest(m, filepath.Join("..", "..", ".."))
1621
}
22+
23+
func TestIndexAndSearch(t *testing.T) {
24+
models.PrepareTestEnv(t)
25+
26+
dir := "./bleve.index"
27+
os.RemoveAll(dir)
28+
29+
setting.Indexer.RepoIndexerEnabled = true
30+
idx, _, err := NewBleveIndexer(dir)
31+
if err != nil {
32+
idx.Close()
33+
log.Fatal("indexer.Init: %v", err)
34+
}
35+
36+
err = idx.Index(1)
37+
assert.NoError(t, err)
38+
39+
var (
40+
keywords = []struct {
41+
Keyword string
42+
IDs []int64
43+
}{
44+
{
45+
Keyword: "Description",
46+
IDs: []int64{1},
47+
},
48+
{
49+
Keyword: "repo1",
50+
IDs: []int64{1},
51+
},
52+
{
53+
Keyword: "non-exist",
54+
IDs: []int64{},
55+
},
56+
}
57+
)
58+
59+
for _, kw := range keywords {
60+
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
61+
assert.NoError(t, err)
62+
assert.EqualValues(t, len(kw.IDs), total)
63+
64+
var ids = make([]int64, 0, len(res))
65+
for _, hit := range res {
66+
ids = append(ids, hit.RepoID)
67+
}
68+
assert.EqualValues(t, kw.IDs, ids)
69+
}
70+
}

modules/indexer/code/git.go

+147
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// Copyright 2019 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package code
6+
7+
import (
8+
"strconv"
9+
"strings"
10+
11+
"code.gitea.io/gitea/models"
12+
"code.gitea.io/gitea/modules/git"
13+
"code.gitea.io/gitea/modules/log"
14+
"code.gitea.io/gitea/modules/setting"
15+
)
16+
17+
// fileUpdate describes one file whose content should be (re)indexed.
type fileUpdate struct {
	// Filename is the name of the file as reported by the git tree entry.
	Filename string
	// BlobSha is the string form of the tree entry's object ID.
	BlobSha string
}
21+
22+
// repoChanges changes (file additions/updates/removals) to a repo
23+
type repoChanges struct {
24+
Updates []fileUpdate
25+
RemovedFilenames []string
26+
}
27+
28+
func getDefaultBranchSha(repo *models.Repository) (string, error) {
29+
stdout, err := git.NewCommand("show-ref", "-s", git.BranchPrefix+repo.DefaultBranch).RunInDir(repo.RepoPath())
30+
if err != nil {
31+
return "", err
32+
}
33+
return strings.TrimSpace(stdout), nil
34+
}
35+
36+
// getRepoChanges returns changes to repo since last indexer update
37+
func getRepoChanges(repo *models.Repository, revision string) (*repoChanges, error) {
38+
if err := repo.GetIndexerStatus(); err != nil {
39+
return nil, err
40+
}
41+
42+
if len(repo.IndexerStatus.CommitSha) == 0 {
43+
return genesisChanges(repo, revision)
44+
}
45+
return nonGenesisChanges(repo, revision)
46+
}
47+
48+
func isIndexable(entry *git.TreeEntry) bool {
49+
if !entry.IsRegular() && !entry.IsExecutable() {
50+
return false
51+
}
52+
name := strings.ToLower(entry.Name())
53+
for _, g := range setting.Indexer.ExcludePatterns {
54+
if g.Match(name) {
55+
return false
56+
}
57+
}
58+
for _, g := range setting.Indexer.IncludePatterns {
59+
if g.Match(name) {
60+
return true
61+
}
62+
}
63+
return len(setting.Indexer.IncludePatterns) == 0
64+
}
65+
66+
// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
67+
func parseGitLsTreeOutput(stdout []byte) ([]fileUpdate, error) {
68+
entries, err := git.ParseTreeEntries(stdout)
69+
if err != nil {
70+
return nil, err
71+
}
72+
var idxCount = 0
73+
updates := make([]fileUpdate, len(entries))
74+
for _, entry := range entries {
75+
if isIndexable(entry) {
76+
updates[idxCount] = fileUpdate{
77+
Filename: entry.Name(),
78+
BlobSha: entry.ID.String(),
79+
}
80+
idxCount++
81+
}
82+
}
83+
return updates[:idxCount], nil
84+
}
85+
86+
// genesisChanges get changes to add repo to the indexer for the first time
87+
func genesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
88+
var changes repoChanges
89+
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
90+
RunInDirBytes(repo.RepoPath())
91+
if err != nil {
92+
return nil, err
93+
}
94+
changes.Updates, err = parseGitLsTreeOutput(stdout)
95+
return &changes, err
96+
}
97+
98+
// nonGenesisChanges get changes since the previous indexer update
99+
func nonGenesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
100+
diffCmd := git.NewCommand("diff", "--name-status",
101+
repo.IndexerStatus.CommitSha, revision)
102+
stdout, err := diffCmd.RunInDir(repo.RepoPath())
103+
if err != nil {
104+
// previous commit sha may have been removed by a force push, so
105+
// try rebuilding from scratch
106+
log.Warn("git diff: %v", err)
107+
if err = indexer.Delete(repo.ID); err != nil {
108+
return nil, err
109+
}
110+
return genesisChanges(repo, revision)
111+
}
112+
var changes repoChanges
113+
updatedFilenames := make([]string, 0, 10)
114+
for _, line := range strings.Split(stdout, "\n") {
115+
line = strings.TrimSpace(line)
116+
if len(line) == 0 {
117+
continue
118+
}
119+
filename := strings.TrimSpace(line[1:])
120+
if len(filename) == 0 {
121+
continue
122+
} else if filename[0] == '"' {
123+
filename, err = strconv.Unquote(filename)
124+
if err != nil {
125+
return nil, err
126+
}
127+
}
128+
129+
switch status := line[0]; status {
130+
case 'M', 'A':
131+
updatedFilenames = append(updatedFilenames, filename)
132+
case 'D':
133+
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
134+
default:
135+
log.Warn("Unrecognized status: %c (line=%s)", status, line)
136+
}
137+
}
138+
139+
cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
140+
cmd.AddArguments(updatedFilenames...)
141+
lsTreeStdout, err := cmd.RunInDirBytes(repo.RepoPath())
142+
if err != nil {
143+
return nil, err
144+
}
145+
changes.Updates, err = parseGitLsTreeOutput(lsTreeStdout)
146+
return &changes, err
147+
}

modules/indexer/code/indexer.go

+54-53
Original file line numberDiff line numberDiff line change
@@ -5,72 +5,73 @@
55
package code
66

77
import (
8-
"os"
9-
"strconv"
8+
"time"
109

10+
"code.gitea.io/gitea/modules/graceful"
11+
"code.gitea.io/gitea/modules/log"
1112
"code.gitea.io/gitea/modules/setting"
13+
)
1214

13-
"github.com/blevesearch/bleve"
14-
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
15-
"github.com/blevesearch/bleve/index/upsidedown"
16-
"github.com/blevesearch/bleve/mapping"
17-
"github.com/blevesearch/bleve/search/query"
18-
"github.com/ethantkoenig/rupture"
15+
var (
16+
indexer Indexer
1917
)
2018

21-
// indexerID a bleve-compatible unique identifier for an integer id
22-
func indexerID(id int64) string {
23-
return strconv.FormatInt(id, 36)
19+
// SearchResult is a single hit returned by a code search.
type SearchResult struct {
	// RepoID is the ID of the repository the hit belongs to.
	RepoID int64
	// StartIndex and EndIndex delimit the match; presumably offsets into
	// Content — confirm against the indexer implementation.
	StartIndex int
	EndIndex   int
	// Filename is the name of the matching file.
	Filename string
	// Content is the content associated with the hit.
	Content string
}
2527

26-
// numericEqualityQuery a numeric equality query for the given value and field
27-
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
28-
f := float64(value)
29-
tru := true
30-
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
31-
q.SetField(field)
32-
return q
28+
// Indexer defines an interface to indexer issues contents
29+
type Indexer interface {
30+
Index(repoID int64) error
31+
Delete(repoID int64) error
32+
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
33+
Close()
3334
}
3435

35-
const unicodeNormalizeName = "unicodeNormalize"
36+
// Init initialize the repo indexer
37+
func Init() {
38+
if !setting.Indexer.RepoIndexerEnabled {
39+
return
40+
}
3641

37-
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
38-
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
39-
"type": unicodenorm.Name,
40-
"form": unicodenorm.NFC,
41-
})
42-
}
42+
waitChannel := make(chan time.Duration)
43+
go func() {
44+
start := time.Now()
45+
log.Info("Initializing Repository Indexer")
46+
var created bool
47+
var err error
48+
indexer, created, err = NewBleveIndexer(setting.Indexer.RepoPath)
49+
if err != nil {
50+
indexer.Close()
51+
log.Fatal("indexer.Init: %v", err)
52+
}
4353

44-
const maxBatchSize = 16
54+
go processRepoIndexerOperationQueue(indexer)
4555

46-
// openIndexer open the index at the specified path, checking for metadata
47-
// updates and bleve version updates. If index needs to be created (or
48-
// re-created), returns (nil, nil)
49-
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
50-
_, err := os.Stat(setting.Indexer.IssuePath)
51-
if err != nil && os.IsNotExist(err) {
52-
return nil, nil
53-
} else if err != nil {
54-
return nil, err
55-
}
56+
if created {
57+
go populateRepoIndexer()
58+
}
5659

57-
metadata, err := rupture.ReadIndexMetadata(path)
58-
if err != nil {
59-
return nil, err
60-
}
61-
if metadata.Version < latestVersion {
62-
// the indexer is using a previous version, so we should delete it and
63-
// re-populate
64-
return nil, os.RemoveAll(path)
65-
}
60+
waitChannel <- time.Since(start)
61+
}()
6662

67-
index, err := bleve.Open(path)
68-
if err != nil && err == upsidedown.IncompatibleVersion {
69-
// the indexer was built with a previous version of bleve, so we should
70-
// delete it and re-populate
71-
return nil, os.RemoveAll(path)
72-
} else if err != nil {
73-
return nil, err
63+
if setting.Indexer.StartupTimeout > 0 {
64+
go func() {
65+
timeout := setting.Indexer.StartupTimeout
66+
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
67+
timeout += setting.GracefulHammerTime
68+
}
69+
select {
70+
case duration := <-waitChannel:
71+
log.Info("Repository Indexer Initialization took %v", duration)
72+
case <-time.After(timeout):
73+
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
74+
}
75+
}()
7476
}
75-
return index, nil
7677
}

0 commit comments

Comments
 (0)