Skip to content

Commit 97a7c04

Browse files
GiteaBot and wxiaoguang
authored
Fix bleve fuzziness (#30799) (#30804)
Backport #30799 by wxiaoguang. Fix #30797. Fix #30317. Co-authored-by: wxiaoguang <[email protected]>
1 parent 99e89e5 commit 97a7c04

File tree

4 files changed

+16
-10
lines changed

4 files changed

+16
-10
lines changed

modules/indexer/code/bleve/bleve.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ import (
3939
const (
4040
unicodeNormalizeName = "unicodeNormalize"
4141
maxBatchSize = 16
42-
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
43-
fuzzyDenominator = 4
4442
)
4543

4644
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@@ -245,7 +243,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
245243
phraseQuery.Analyzer = repoIndexerAnalyzer
246244
keywordQuery = phraseQuery
247245
if opts.IsKeywordFuzzy {
248-
phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
246+
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
249247
}
250248

251249
if len(opts.RepoIDs) > 0 {

modules/indexer/internal/bleve/util.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,15 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
4747

4848
return index, 0, nil
4949
}
50+
51+
// GuessFuzzinessByKeyword returns the fuzziness (maximum Levenshtein edit
// distance) to use when searching for the keyword s.
//
// A keyword made only of ASCII characters is allowed one edit per 4
// characters, capped at 2. A keyword containing any non-ASCII rune gets a
// fuzziness of 0, i.e. fuzzy matching is effectively disabled for it.
func GuessFuzzinessByKeyword(s string) int {
	const (
		// According to https://github.com/blevesearch/bleve/issues/1563,
		// the supported max fuzziness is 2.
		maxFuzziness = 2
		// Magic number 4 was chosen to determine the levenshtein distance
		// per each character of a keyword.
		charsPerEdit = 4
		// Runes at or above this value are outside the ASCII range.
		asciiLimit = 128
	)

	// When using CJK (eg: `갃갃갃` `啊啊啊`), fuzzy matching mismatches a lot,
	// so any non-ASCII rune turns fuzziness off for the whole keyword.
	for _, r := range s {
		if r >= asciiLimit {
			return 0
		}
	}

	// Every rune is ASCII at this point, so the byte length len(s) equals
	// the number of characters in the keyword.
	return min(maxFuzziness, len(s)/charsPerEdit)
}

modules/indexer/issues/bleve/bleve.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
3535
})
3636
}
3737

38-
const (
39-
maxBatchSize = 16
40-
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
41-
fuzzyDenominator = 4
42-
)
38+
const maxBatchSize = 16
4339

4440
// IndexerData an update to the issue indexer
4541
type IndexerData internal.IndexerData
@@ -162,7 +158,7 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
162158
if options.Keyword != "" {
163159
fuzziness := 0
164160
if options.IsFuzzyKeyword {
165-
fuzziness = len(options.Keyword) / fuzzyDenominator
161+
fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
166162
}
167163

168164
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{

routers/web/repo/search.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func Search(ctx *context.Context) {
2828
ctx.Data["Language"] = language
2929
ctx.Data["IsFuzzy"] = isFuzzy
3030
ctx.Data["PageIsViewCode"] = true
31+
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
3132

3233
if keyword == "" {
3334
ctx.HTML(http.StatusOK, tplSearch)
@@ -86,7 +87,6 @@ func Search(ctx *context.Context) {
8687
}
8788
}
8889

89-
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
9090
ctx.Data["Repo"] = ctx.Repo.Repository
9191
ctx.Data["SearchResults"] = searchResults
9292
ctx.Data["SearchResultLanguages"] = searchResultLanguages

0 commit comments

Comments
 (0)