Skip to content

Commit 7fdc048

Browse files
authored
Patch in exact search for meilisearch (#29671)
meilisearch does not have a search option to control fuzziness per query right now: - meilisearch/meilisearch#1192 - https://github.com/orgs/meilisearch/discussions/377 - meilisearch/meilisearch#1096 so we have to create a workaround by post-filtering the search results in gitea until this is addressed. For future work I added an option, backend-only at the moment, to enable fuzziness for the issue indexer too. I also refactored the code so the fuzzy option follows the same logic as in the code indexer. --- *Sponsored by Kithara Software GmbH*
1 parent baeb251 commit 7fdc048

File tree

14 files changed

+184
-33
lines changed

14 files changed

+184
-33
lines changed

modules/indexer/code/bleve/bleve.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {
233233

234234
// Search searches for files in the specified repo.
235235
// Returns the matching file-paths
236-
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
236+
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
237237
var (
238238
indexerQuery query.Query
239239
keywordQuery query.Query
240240
)
241241

242-
if isMatch {
243-
prefixQuery := bleve.NewPrefixQuery(keyword)
244-
prefixQuery.FieldVal = "Content"
245-
keywordQuery = prefixQuery
246-
} else {
242+
if isFuzzy {
247243
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
248244
phraseQuery.FieldVal = "Content"
249245
phraseQuery.Analyzer = repoIndexerAnalyzer
250246
keywordQuery = phraseQuery
247+
} else {
248+
prefixQuery := bleve.NewPrefixQuery(keyword)
249+
prefixQuery.FieldVal = "Content"
250+
keywordQuery = prefixQuery
251251
}
252252

253253
if len(repoIDs) > 0 {

modules/indexer/code/elasticsearch/elasticsearch.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
281281
}
282282

283283
// Search searches for codes and language stats by given conditions.
284-
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
285-
searchType := esMultiMatchTypeBestFields
286-
if isMatch {
287-
searchType = esMultiMatchTypePhrasePrefix
284+
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
285+
searchType := esMultiMatchTypePhrasePrefix
286+
if isFuzzy {
287+
searchType = esMultiMatchTypeBestFields
288288
}
289289

290290
kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)

modules/indexer/code/indexer_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
7070

7171
for _, kw := range keywords {
7272
t.Run(kw.Keyword, func(t *testing.T) {
73-
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
73+
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
7474
assert.NoError(t, err)
7575
assert.Len(t, kw.IDs, int(total))
7676
assert.Len(t, langs, kw.Langs)

modules/indexer/code/internal/indexer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type Indexer interface {
1616
internal.Indexer
1717
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
1818
Delete(ctx context.Context, repoID int64) error
19-
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
19+
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
2020
}
2121

2222
// NewDummyIndexer returns a dummy indexer
@@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
3838
return fmt.Errorf("indexer is not ready")
3939
}
4040

41-
func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
41+
func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
4242
return 0, nil, nil, fmt.Errorf("indexer is not ready")
4343
}

modules/indexer/code/search.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
124124
}
125125

126126
// PerformSearch perform a search on a repository
127-
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
127+
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
128+
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
128129
if len(keyword) == 0 {
129130
return 0, nil, nil, nil
130131
}
131132

132-
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
133+
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
133134
if err != nil {
134135
return 0, nil, nil, err
135136
}

modules/indexer/internal/bleve/query.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
2525
return q
2626
}
2727

28+
// PrefixQuery generates a match prefix query for the given prefix and field
29+
func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
30+
q := bleve.NewPrefixQuery(matchPrefix)
31+
q.FieldVal = field
32+
return q
33+
}
34+
2835
// BoolFieldQuery generates a bool field query for the given value and field
2936
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
3037
q := bleve.NewBoolFieldQuery(value)

modules/indexer/issues/bleve/bleve.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
156156
var queries []query.Query
157157

158158
if options.Keyword != "" {
159-
keywordQueries := []query.Query{
160-
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
161-
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
162-
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
159+
if options.IsFuzzyKeyword {
160+
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
161+
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
162+
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
163+
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
164+
}...))
165+
} else {
166+
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
167+
inner_bleve.PrefixQuery(options.Keyword, "title"),
168+
inner_bleve.PrefixQuery(options.Keyword, "content"),
169+
inner_bleve.PrefixQuery(options.Keyword, "comments"),
170+
}...))
163171
}
164-
queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
165172
}
166173

167174
if len(options.RepoIDs) > 0 || options.AllPublic {

modules/indexer/issues/elasticsearch/elasticsearch.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ import (
1919

2020
const (
2121
issueIndexerLatestVersion = 1
22+
// multi-match-types, currently only 2 types are used
23+
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
24+
esMultiMatchTypeBestFields = "best_fields"
25+
esMultiMatchTypePhrasePrefix = "phrase_prefix"
2226
)
2327

2428
var _ internal.Indexer = &Indexer{}
@@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
141145
query := elastic.NewBoolQuery()
142146

143147
if options.Keyword != "" {
144-
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
148+
149+
searchType := esMultiMatchTypePhrasePrefix
150+
if options.IsFuzzyKeyword {
151+
searchType = esMultiMatchTypeBestFields
152+
}
153+
154+
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
145155
}
146156

147157
if len(options.RepoIDs) > 0 {

modules/indexer/issues/internal/model.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ type SearchResult struct {
7474
type SearchOptions struct {
7575
Keyword string // keyword to search
7676

77+
IsFuzzyKeyword bool // if false the levenshtein distance is 0
78+
7779
RepoIDs []int64 // repository IDs which the issues belong to
7880
AllPublic bool // if include all public repositories
7981

modules/indexer/issues/meilisearch/meilisearch.go

Lines changed: 85 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package meilisearch
55

66
import (
77
"context"
8+
"errors"
89
"strconv"
910
"strings"
1011

@@ -16,12 +17,15 @@ import (
1617
)
1718

1819
const (
19-
issueIndexerLatestVersion = 2
20+
issueIndexerLatestVersion = 3
2021

2122
// TODO: make this configurable if necessary
2223
maxTotalHits = 10000
2324
)
2425

26+
// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
27+
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
28+
2529
var _ internal.Indexer = &Indexer{}
2630

2731
// Indexer implements Indexer interface
@@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
4751
},
4852
DisplayedAttributes: []string{
4953
"id",
54+
"title",
55+
"content",
56+
"comments",
5057
},
5158
FilterableAttributes: []string{
5259
"repo_id",
@@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
221228
return nil, err
222229
}
223230

224-
hits := make([]internal.Match, 0, len(searchRes.Hits))
225-
for _, hit := range searchRes.Hits {
226-
hits = append(hits, internal.Match{
227-
ID: int64(hit.(map[string]any)["id"].(float64)),
228-
})
231+
hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
232+
if err != nil {
233+
return nil, err
229234
}
230235

231236
return &internal.SearchResult{
@@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
241246
}
242247
return field + ":asc"
243248
}
249+
250+
// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
251+
// and you can only change "typo tolerance" per index. So we have to post-filter the results
252+
// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
253+
// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
254+
func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
255+
hits := make([]internal.Match, 0, len(searchRes.Hits))
256+
for _, hit := range searchRes.Hits {
257+
hit, ok := hit.(map[string]any)
258+
if !ok {
259+
return nil, ErrMalformedResponse
260+
}
261+
262+
if !isFuzzy {
263+
keyword = strings.ToLower(keyword)
264+
265+
// declare a anon func to check if the title, content or at least one comment contains the keyword
266+
found, err := func() (bool, error) {
267+
// check if title match first
268+
title, ok := hit["title"].(string)
269+
if !ok {
270+
return false, ErrMalformedResponse
271+
} else if strings.Contains(strings.ToLower(title), keyword) {
272+
return true, nil
273+
}
274+
275+
// check if content has a match
276+
content, ok := hit["content"].(string)
277+
if !ok {
278+
return false, ErrMalformedResponse
279+
} else if strings.Contains(strings.ToLower(content), keyword) {
280+
return true, nil
281+
}
282+
283+
// now check for each comment if one has a match
284+
// so we first try to cast and skip if there are no comments
285+
comments, ok := hit["comments"].([]any)
286+
if !ok {
287+
return false, ErrMalformedResponse
288+
} else if len(comments) == 0 {
289+
return false, nil
290+
}
291+
292+
// now we iterate over all and report as soon as we detect one match
293+
for i := range comments {
294+
comment, ok := comments[i].(string)
295+
if !ok {
296+
return false, ErrMalformedResponse
297+
}
298+
if strings.Contains(strings.ToLower(comment), keyword) {
299+
return true, nil
300+
}
301+
}
302+
303+
// we got no match
304+
return false, nil
305+
}()
306+
307+
if err != nil {
308+
return nil, err
309+
} else if !found {
310+
continue
311+
}
312+
}
313+
issueID, ok := hit["id"].(float64)
314+
if !ok {
315+
return nil, ErrMalformedResponse
316+
}
317+
hits = append(hits, internal.Match{
318+
ID: int64(issueID),
319+
})
320+
}
321+
return hits, nil
322+
}

modules/indexer/issues/meilisearch/meilisearch_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ import (
1010
"testing"
1111
"time"
1212

13+
"code.gitea.io/gitea/modules/indexer/issues/internal"
1314
"code.gitea.io/gitea/modules/indexer/issues/internal/tests"
15+
16+
"github.com/meilisearch/meilisearch-go"
17+
"github.com/stretchr/testify/assert"
1418
)
1519

1620
func TestMeilisearchIndexer(t *testing.T) {
@@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {
4852

4953
tests.TestIndexer(t, indexer)
5054
}
55+
56+
func TestNonFuzzyWorkaround(t *testing.T) {
57+
// get unexpected return
58+
_, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
59+
Hits: []any{"aa", "bb", "cc", "dd"},
60+
}, "bowling", false)
61+
assert.ErrorIs(t, err, ErrMalformedResponse)
62+
63+
validResponse := &meilisearch.SearchResponse{
64+
Hits: []any{
65+
map[string]any{
66+
"id": float64(11),
67+
"title": "a title",
68+
"content": "issue body with no match",
69+
"comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
70+
},
71+
map[string]any{
72+
"id": float64(22),
73+
"title": "Bowling as title",
74+
"content": "",
75+
"comments": []any{},
76+
},
77+
map[string]any{
78+
"id": float64(33),
79+
"title": "Bowl-ing as fuzzy match",
80+
"content": "",
81+
"comments": []any{},
82+
},
83+
},
84+
}
85+
86+
// nonFuzzy
87+
hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
88+
assert.NoError(t, err)
89+
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)
90+
91+
// fuzzy
92+
hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
93+
assert.NoError(t, err)
94+
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
95+
}

routers/web/explore/code.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
3535
keyword := ctx.FormTrim("q")
3636

3737
queryType := ctx.FormTrim("t")
38-
isMatch := queryType == "match"
38+
isFuzzy := queryType != "match"
3939

4040
ctx.Data["Keyword"] = keyword
4141
ctx.Data["Language"] = language
@@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
7777
)
7878

7979
if (len(repoIDs) > 0) || isAdmin {
80-
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
80+
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
8181
if err != nil {
8282
if code_indexer.IsAvailable(ctx) {
8383
ctx.ServerError("SearchResults", err)

routers/web/repo/search.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func Search(ctx *context.Context) {
2525
keyword := ctx.FormTrim("q")
2626

2727
queryType := ctx.FormTrim("t")
28-
isMatch := queryType == "match"
28+
isFuzzy := queryType != "match"
2929

3030
ctx.Data["Keyword"] = keyword
3131
ctx.Data["Language"] = language
@@ -43,7 +43,7 @@ func Search(ctx *context.Context) {
4343
}
4444

4545
total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
46-
language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
46+
language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
4747
if err != nil {
4848
if code_indexer.IsAvailable(ctx) {
4949
ctx.ServerError("SearchResults", err)

routers/web/user/code.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
4040
keyword := ctx.FormTrim("q")
4141

4242
queryType := ctx.FormTrim("t")
43-
isMatch := queryType == "match"
43+
isFuzzy := queryType != "match"
4444

4545
ctx.Data["Keyword"] = keyword
4646
ctx.Data["Language"] = language
@@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
7575
)
7676

7777
if len(repoIDs) > 0 {
78-
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
78+
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
7979
if err != nil {
8080
if code_indexer.IsAvailable(ctx) {
8181
ctx.ServerError("SearchResults", err)

0 commit comments

Comments
 (0)