From bf87466f8d8ee6f11a964dcff57d257833dd174f Mon Sep 17 00:00:00 2001 From: "m.huber" Date: Sun, 10 Mar 2024 22:34:58 +0100 Subject: [PATCH 1/9] determine fuzziness of bleve indexer by keyword length --- modules/indexer/code/bleve/bleve.go | 15 +++++++-------- modules/indexer/internal/bleve/query.go | 10 ++-------- modules/indexer/issues/bleve/bleve.go | 25 +++++++++++++------------ 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 107dd23598d1..cb75009052b7 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -39,6 +39,8 @@ import ( const ( unicodeNormalizeName = "unicodeNormalize" maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 ) func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { @@ -239,15 +241,12 @@ func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword keywordQuery query.Query ) + phraseQuery := bleve.NewMatchPhraseQuery(keyword) + phraseQuery.FieldVal = "Content" + phraseQuery.Analyzer = repoIndexerAnalyzer + keywordQuery = phraseQuery if isFuzzy { - phraseQuery := bleve.NewMatchPhraseQuery(keyword) - phraseQuery.FieldVal = "Content" - phraseQuery.Analyzer = repoIndexerAnalyzer - keywordQuery = phraseQuery - } else { - prefixQuery := bleve.NewPrefixQuery(keyword) - prefixQuery.FieldVal = "Content" - keywordQuery = prefixQuery + phraseQuery.Fuzziness = len(keyword) / fuzzyDenominator } if len(repoIDs) > 0 { diff --git a/modules/indexer/internal/bleve/query.go b/modules/indexer/internal/bleve/query.go index 2a427c402026..49c0d58ac105 100644 --- a/modules/indexer/internal/bleve/query.go +++ b/modules/indexer/internal/bleve/query.go @@ -18,17 +18,11 @@ func NumericEqualityQuery(value int64, field string) *query.NumericRangeQuery { } // MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer -func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQuery { +func MatchPhraseQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchPhraseQuery { q := bleve.NewMatchPhraseQuery(matchPhrase) q.FieldVal = field q.Analyzer = analyzer - return q -} - -// PrefixQuery generates a match prefix query for the given prefix and field -func PrefixQuery(matchPrefix, field string) *query.PrefixQuery { - q := bleve.NewPrefixQuery(matchPrefix) - q.FieldVal = field + q.Fuzziness = fuzziness return q } diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index aaea854efa03..15b6376daed5 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -35,7 +35,11 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { }) } -const maxBatchSize = 16 +const ( + maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 +) // IndexerData an update to the issue indexer type IndexerData internal.IndexerData @@ -156,19 +160,16 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( var queries []query.Query if options.Keyword != "" { + fuzziness := 0 if options.IsFuzzyKeyword { - queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ - inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer), - }...)) - } else { - queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ - inner_bleve.PrefixQuery(options.Keyword, "title"), - inner_bleve.PrefixQuery(options.Keyword, "content"), - inner_bleve.PrefixQuery(options.Keyword, "comments"), - }...)) + fuzziness = len(options.Keyword) / fuzzyDenominator } + + queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ + inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness), + inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness), + inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness), + }...)) } if len(options.RepoIDs) > 0 || options.AllPublic { From 66a6aadf26a61463de1aa9074bb7a24c5407459c Mon Sep 17 00:00:00 2001 From: "m.huber" Date: Mon, 11 Mar 2024 16:00:27 +0100 Subject: [PATCH 2/9] adjust integration test --- modules/indexer/code/indexer.go | 8 ++++++++ tests/integration/repo_search_test.go | 28 ++++++++++++++++++++------- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index ebebf6ba8a28..6c29cf630aa3 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -311,3 +311,11 @@ func populateRepoIndexer(ctx context.Context) { } log.Info("Done (re)populating the repo indexer with existing repositories") } + +// GetQueueItemNumber is used in integration tests +func GetQueueItemNumber() int { + if indexerQueue != nil { + return indexerQueue.GetQueueItemNumber() + } + return -1 +} diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index cf199e98c289..ee07348abeb9 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -6,6 +6,7 @@ package integration import ( "net/http" "testing" + "time" "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" @@ -32,7 +33,7 @@ func TestSearchRepo(t *testing.T) { repo, err := repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "repo1") assert.NoError(t, err) - executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) + executeIndexer(t, repo) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -42,12 +43,14 @@ func TestSearchRepo(t *testing.T) { repo, err = repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "glob") assert.NoError(t, err) - executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) + executeIndexer(t, repo) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) - testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt"}) - testSearch(t, "/user2/glob/search?q=file4&page=1", []string{}) - testSearch(t, "/user2/glob/search?q=file5&page=1", []string{}) + testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) + testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt", "a.txt"}) + testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt"}) + testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{}) + testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{}) } func testSearch(t *testing.T, url string, expected []string) { @@ -58,6 +61,17 @@ func testSearch(t *testing.T, url string, expected []string) { assert.EqualValues(t, expected, filenames) } -func executeIndexer(t *testing.T, repo *repo_model.Repository, op func(*repo_model.Repository)) { - op(repo) +func executeIndexer(t *testing.T, repo *repo_model.Repository) { + code_indexer.UpdateRepoIndexer(repo) + + for { + number := code_indexer.GetQueueItemNumber() + if number == 0 { + return + } + if number == -1 { + t.Fatal("Indexing failed") + } + time.Sleep(10 * time.Millisecond) + } } From f26480c060df972d6f4facd0547992ba3c94c58a Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 16 Mar 2024 17:34:16 +0100 Subject: [PATCH 3/9] queue is sync in tests --- modules/indexer/code/indexer.go | 8 -------- tests/integration/repo_search_test.go | 20 ++------------------ 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index 6c29cf630aa3..ebebf6ba8a28 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -311,11 +311,3 @@ func populateRepoIndexer(ctx context.Context) { } log.Info("Done (re)populating the repo indexer with existing repositories") } - -// GetQueueItemNumber is used in integration tests -func GetQueueItemNumber() int { - if indexerQueue != nil { - return indexerQueue.GetQueueItemNumber() - } - return -1 -} diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index ee07348abeb9..f16430d5f756 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -6,7 +6,6 @@ package integration import ( "net/http" "testing" - "time" "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" @@ -33,7 +32,7 @@ func TestSearchRepo(t *testing.T) { repo, err := repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "repo1") assert.NoError(t, err) - executeIndexer(t, repo) + code_indexer.UpdateRepoIndexer(repo) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -43,7 +42,7 @@ func TestSearchRepo(t *testing.T) { repo, err = repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "glob") assert.NoError(t, err) - executeIndexer(t, repo) + code_indexer.UpdateRepoIndexer(repo) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) @@ -60,18 +59,3 @@ func testSearch(t *testing.T, url string, expected []string) { filenames := resultFilenames(t, NewHTMLParser(t, resp.Body)) assert.EqualValues(t, expected, filenames) } - -func executeIndexer(t *testing.T, repo *repo_model.Repository) { - code_indexer.UpdateRepoIndexer(repo) - - for { - number := code_indexer.GetQueueItemNumber() - if number == 0 { - return - } - if number == -1 { - t.Fatal("Indexing failed") - } - time.Sleep(10 * time.Millisecond) - } -} From b48ed5facf671e9fa1f002ba12709cbc433fdff0 Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 11:02:43 +0100 Subject: [PATCH 4/9] Apply suggestions from code review --- tests/integration/repo_search_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index f16430d5f756..c3550d3c8a8f 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -33,6 +33,7 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) + time.Sleep(1000) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -43,6 +44,7 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) + time.Sleep(1000) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) From 6e6ec0d3c8262d25f28db08b98bb3836cae871ed Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 11:55:10 +0100 Subject: [PATCH 5/9] Update repo_search_test.go --- tests/integration/repo_search_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index c3550d3c8a8f..f4edd368f33e 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -6,6 +6,7 @@ package integration import ( "net/http" "testing" + "time" "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" From 238e9f246b449cc22468ea8f9ece01dac90ec01e Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 11:56:52 +0100 Subject: [PATCH 6/9] Apply suggestions from code review --- tests/integration/repo_search_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index f4edd368f33e..d65efbbb0b9f 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -34,7 +34,7 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) - time.Sleep(1000) + time.Sleep(100 * time.Millisecond) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -45,7 +45,7 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) - time.Sleep(1000) + time.Sleep(100 * time.Millisecond) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) From f32bb1cad91e30e50ae47f729cc4e15e4826542c Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 14:17:06 +0100 Subject: [PATCH 7/9] Apply suggestions from code review --- tests/integration/repo_search_test.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index d65efbbb0b9f..9522283cc4a3 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -6,7 +6,6 @@ package integration import ( "net/http" "testing" - "time" "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" @@ -34,7 +33,6 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) - time.Sleep(100 * time.Millisecond) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -45,12 +43,11 @@ func TestSearchRepo(t *testing.T) { assert.NoError(t, err) code_indexer.UpdateRepoIndexer(repo) - time.Sleep(100 * time.Millisecond) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt", "a.txt"}) - testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt"}) + testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt", "a.txt"}) testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{}) testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{}) } From f807f757c4e3f5fe2cccbc93b1328ea84e31fd5c Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 15:11:42 +0100 Subject: [PATCH 8/9] Apply suggestions from code review --- tests/integration/repo_search_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index 9522283cc4a3..917a6f943719 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -48,7 +48,7 @@ func TestSearchRepo(t *testing.T) { testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt", "a.txt"}) testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt", "a.txt"}) - testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{}) + testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{"x/b.txt", "a.txt"}) testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{}) } From 8dc801ab76a9a058539d896b7b27c8b4bd2f6af8 Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 15:11:56 +0100 Subject: [PATCH 9/9] Apply suggestions from code review --- tests/integration/repo_search_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index 917a6f943719..56cc45d9010e 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -49,7 +49,7 @@ func TestSearchRepo(t *testing.T) { testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt", "a.txt"}) testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt", "a.txt"}) testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{"x/b.txt", "a.txt"}) - testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{}) + testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{"x/b.txt", "a.txt"}) } func testSearch(t *testing.T, url string, expected []string) {