From cdc27b0d62b8bfdf1445d5256394b65280c158dc Mon Sep 17 00:00:00 2001 From: Shiny Nematoda Date: Wed, 17 Dec 2025 13:51:48 +0100 Subject: [PATCH] feat: add support to opt-in for fuzzy search (#10378) Fuzzy search is kept behind a flag because it is computationally intensive (see #5261). Admins may opt in by setting the `[indexer].REPO_INDEXER_FUZZY_ENABLED` flag to true. Closes #10331 Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/10378 Reviewed-by: Gusted Co-authored-by: Shiny Nematoda Co-committed-by: Shiny Nematoda --- modules/git/grep.go | 4 ++ modules/indexer/code/bleve/bleve.go | 8 ++- .../code/elasticsearch/elasticsearch.go | 5 +- modules/indexer/code/indexer.go | 11 ++++ modules/indexer/code/indexer_test.go | 51 +++++++++++++++++++ modules/indexer/code/internal/indexer.go | 9 +++- modules/indexer/code/search.go | 3 +- modules/setting/indexer.go | 39 +++++++------- options/locale_next/locale_en-US.json | 2 + routers/web/explore/code.go | 10 ++-- routers/web/repo/search.go | 32 ++++++------ routers/web/user/code.go | 10 ++-- tests/integration/explore_code_test.go | 43 ++++++++++++++-- 13 files changed, 175 insertions(+), 52 deletions(-) diff --git a/modules/git/grep.go b/modules/git/grep.go index b5471b8f6c..158551aa66 100644 --- a/modules/git/grep.go +++ b/modules/git/grep.go @@ -39,6 +39,10 @@ const ( // llu:TrKeysSuffix search. 
var GrepSearchOptions = [3]string{"exact", "union", "regexp"} +func (mode GrepMode) String() string { + return GrepSearchOptions[mode] +} + type GrepOptions struct { RefName string MaxResultLimit int diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 4c8b5f2a86..1b80c05aeb 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -259,12 +259,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int if opts.Mode == internal.CodeSearchModeUnion { query := bleve.NewDisjunctionQuery() - for _, field := range strings.Fields(opts.Keyword) { + for field := range strings.FieldsSeq(opts.Keyword) { query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false, 1.0)) } keywordQuery = query } else { - keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false, 1.0) + keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, + "Content", + repoIndexerAnalyzer, + opts.Mode == internal.CodeSearchModeFuzzy, + 1.0) } if len(opts.RepoIDs) > 0 { diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 9b11f56fb7..e87c0e8374 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -335,11 +335,14 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan // Search searches for codes and language stats by given conditions. 
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { searchType := esMultiMatchTypePhrase - if opts.Mode == internal.CodeSearchModeUnion { + if opts.Mode == internal.CodeSearchModeUnion || opts.Mode == internal.CodeSearchModeFuzzy { searchType = esMultiMatchTypeBestFields } kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType) + if opts.Mode == internal.CodeSearchModeFuzzy { + kwQuery = kwQuery.Fuzziness("AUTO") + } query := elastic.NewBoolQuery() query = query.Must(kwQuery) if len(opts.RepoIDs) > 0 { diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index c32b637ab4..f3ed091a30 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -91,12 +91,23 @@ func index(ctx context.Context, indexer internal.Indexer, repoID int64) error { return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha) } +func setSearchOption(set bool, val string) { + if set { + if !slices.Contains(CodeSearchOptions, val) { + CodeSearchOptions = append(CodeSearchOptions, val) + } + } else if i := slices.Index(CodeSearchOptions, val); i >= 0 { + CodeSearchOptions = append(CodeSearchOptions[:i], CodeSearchOptions[i+1:]...) 
+ } +} + // Init initialize the repo indexer func Init() { if !setting.Indexer.RepoIndexerEnabled { (*globalIndexer.Load()).Close() return } + setSearchOption(setting.Indexer.RepoIndexerEnableFuzzy, "fuzzy") ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false) diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 740d2e1b5c..97f17b083f 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -116,6 +116,57 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }) } + t.Run("Fuzzy", func(t *testing.T) { + for _, kw := range []struct { + keyword string + ids []int64 + }{ + { + keyword: "reppo1", // should match repo1 + ids: []int64{repoID}, + }, + { + keyword: "1", // must not be fuzzy match only repo1 + ids: []int64{repoID}, + }, + { + keyword: "Description!", // should match "Description" + ids: []int64{repoID}, + }, + { + keyword: "escription", // should match "Description" + ids: []int64{repoID}, + }, + { + keyword: "form", // should match "for" + ids: []int64{repoID}, + }, + { + keyword: "invalid", // should not match anything + ids: []int64{}, + }, + } { + t.Run(kw.keyword, func(t *testing.T) { + _, res, _, err := indexer.Search(t.Context(), &internal.SearchOptions{ + Keyword: kw.keyword, + Paginator: &db.ListOptions{ + Page: 1, + PageSize: 10, + }, + Mode: SearchModeFuzzy, + }) + require.NoError(t, err) + + ids := make([]int64, 0, len(res)) + for _, hit := range res { + ids = append(ids, hit.RepoID) + } + + assert.Equal(t, kw.ids, ids) + }) + } + }) + require.NoError(t, indexer.Delete(t.Context(), repoID)) }) } diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go index 73662b1dda..32e80b7e73 100644 --- a/modules/indexer/code/internal/indexer.go +++ b/modules/indexer/code/internal/indexer.go @@ -25,13 +25,18 @@ type CodeSearchMode 
int const ( CodeSearchModeExact CodeSearchMode = iota CodeSearchModeUnion + CodeSearchModeFuzzy ) func (mode CodeSearchMode) String() string { - if mode == CodeSearchModeUnion { + switch mode { + case CodeSearchModeFuzzy: + return "fuzzy" + case CodeSearchModeUnion: return "union" + default: + return "exact" } - return "exact" } type SearchOptions struct { diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index 66c9497dab..2085251f1c 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -36,13 +36,14 @@ type SearchResultLanguages = internal.SearchResultLanguages type SearchOptions = internal.SearchOptions // llu:TrKeysSuffix search. -var CodeSearchOptions = [2]string{"exact", "union"} +var CodeSearchOptions = []string{"exact", "union", "fuzzy"} type SearchMode = internal.CodeSearchMode const ( SearchModeExact = internal.CodeSearchModeExact SearchModeUnion = internal.CodeSearchModeUnion + SearchModeFuzzy = internal.CodeSearchModeFuzzy ) func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) { diff --git a/modules/setting/indexer.go b/modules/setting/indexer.go index 6a464ee0de..b112a50cfa 100644 --- a/modules/setting/indexer.go +++ b/modules/setting/indexer.go @@ -23,16 +23,17 @@ var Indexer = struct { IssueIndexerName string StartupTimeout time.Duration - RepoIndexerEnabled bool - RepoIndexerRepoTypes []string - RepoType string - RepoPath string - RepoConnStr string - RepoIndexerName string - MaxIndexerFileSize int64 - IncludePatterns []Glob - ExcludePatterns []Glob - ExcludeVendored bool + RepoIndexerEnabled bool + RepoIndexerRepoTypes []string + RepoIndexerEnableFuzzy bool + RepoType string + RepoPath string + RepoConnStr string + RepoIndexerName string + MaxIndexerFileSize int64 + IncludePatterns []Glob + ExcludePatterns []Glob + ExcludeVendored bool }{ IssueType: "bleve", IssuePath: "indexers/issues.bleve", @@ -40,14 +41,15 @@ var Indexer = struct { IssueConnAuth: "", 
IssueIndexerName: "gitea_issues", - RepoIndexerEnabled: false, - RepoIndexerRepoTypes: []string{"sources", "forks", "mirrors", "templates"}, - RepoType: "bleve", - RepoPath: "indexers/repos.bleve", - RepoConnStr: "", - RepoIndexerName: "gitea_codes", - MaxIndexerFileSize: 1024 * 1024, - ExcludeVendored: true, + RepoIndexerEnabled: false, + RepoIndexerRepoTypes: []string{"sources", "forks", "mirrors", "templates"}, + RepoIndexerEnableFuzzy: false, + RepoType: "bleve", + RepoPath: "indexers/repos.bleve", + RepoConnStr: "", + RepoIndexerName: "gitea_codes", + MaxIndexerFileSize: 1024 * 1024, + ExcludeVendored: true, } type Glob struct { @@ -87,6 +89,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) { Indexer.RepoIndexerEnabled = sec.Key("REPO_INDEXER_ENABLED").MustBool(false) Indexer.RepoIndexerRepoTypes = strings.Split(sec.Key("REPO_INDEXER_REPO_TYPES").MustString("sources,forks,mirrors,templates"), ",") + Indexer.RepoIndexerEnableFuzzy = sec.Key("REPO_INDEXER_FUZZY_ENABLED").MustBool(false) Indexer.RepoType = sec.Key("REPO_INDEXER_TYPE").MustString("bleve") Indexer.RepoPath = filepath.ToSlash(sec.Key("REPO_INDEXER_PATH").MustString(filepath.ToSlash(filepath.Join(AppDataPath, "indexers/repos.bleve")))) if !filepath.IsAbs(Indexer.RepoPath) { diff --git a/options/locale_next/locale_en-US.json b/options/locale_next/locale_en-US.json index 13f2ed6f55..d9d23653cd 100644 --- a/options/locale_next/locale_en-US.json +++ b/options/locale_next/locale_en-US.json @@ -100,6 +100,8 @@ "repo.issue_indexer.title": "Issue Indexer", "search.milestone_kind": "Search milestones…", "search.syntax": "Search syntax", + "search.fuzzy": "Fuzzy", + "search.fuzzy_tooltip": "Include results that are an approximate match to the search term", "repo.settings.push_mirror.branch_filter.label": "Branch filter (optional)", "repo.settings.push_mirror.branch_filter.description": "Branches to be mirrored. Leave blank to mirror all branches. See %[2]s documentation for syntax. 
Examples: main, release/*", "incorrect_root_url": "This Forgejo instance is configured to be served on \"%s\". You are currently viewing Forgejo through a different URL, which may cause parts of the application to break. The canonical URL is controlled by Forgejo admins via the ROOT_URL setting in the app.ini.", diff --git a/routers/web/explore/code.go b/routers/web/explore/code.go index 6697755c22..2e11f70585 100644 --- a/routers/web/explore/code.go +++ b/routers/web/explore/code.go @@ -38,10 +38,14 @@ func Code(ctx *context.Context) { path := ctx.FormTrim("path") mode := code_indexer.SearchModeExact - if m := ctx.FormTrim("mode"); m == "union" || - m == "fuzzy" || - ctx.FormBool("fuzzy") { + if m := ctx.FormTrim("mode"); m == "union" { mode = code_indexer.SearchModeUnion + } else if m == "fuzzy" || ctx.FormBool("fuzzy") { + if setting.Indexer.RepoIndexerEnableFuzzy { + mode = code_indexer.SearchModeFuzzy + } else { + mode = code_indexer.SearchModeUnion + } } ctx.Data["Keyword"] = keyword diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go index ad10542c01..c3b4d07fa0 100644 --- a/routers/web/repo/search.go +++ b/routers/web/repo/search.go @@ -22,13 +22,16 @@ type searchMode int const ( ExactSearchMode searchMode = iota UnionSearchMode + FuzzySearchMode RegExpSearchMode ) func searchModeFromString(s string) searchMode { switch s { - case "fuzzy", "union": + case "union": return UnionSearchMode + case "fuzzy": + return FuzzySearchMode case "regexp": return RegExpSearchMode default: @@ -36,23 +39,13 @@ func searchModeFromString(s string) searchMode { } } -func (m searchMode) String() string { - switch m { - case ExactSearchMode: - return "exact" - case UnionSearchMode: - return "union" - case RegExpSearchMode: - return "regexp" - default: - panic("cannot happen") - } -} - func (m searchMode) ToIndexer() code_indexer.SearchMode { if m == ExactSearchMode { return code_indexer.SearchModeExact } + if setting.Indexer.RepoIndexerEnableFuzzy && m == 
FuzzySearchMode { + return code_indexer.SearchModeFuzzy + } return code_indexer.SearchModeUnion } @@ -83,7 +76,6 @@ func Search(ctx *context.Context) { ctx.Data["Keyword"] = keyword ctx.Data["Language"] = language ctx.Data["CodeSearchPath"] = path - ctx.Data["CodeSearchMode"] = mode.String() ctx.Data["PageIsViewCode"] = true ctx.Data["CodeIndexerDisabled"] = !setting.Indexer.RepoIndexerEnabled if setting.Indexer.RepoIndexerEnabled { @@ -106,11 +98,14 @@ func Search(ctx *context.Context) { var searchResults []*code_indexer.Result var searchResultLanguages []*code_indexer.SearchResultLanguages if setting.Indexer.RepoIndexerEnabled { + m := mode.ToIndexer() + ctx.Data["CodeSearchMode"] = m.String() + var err error total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ RepoIDs: []int64{ctx.Repo.Repository.ID}, Keyword: keyword, - Mode: mode.ToIndexer(), + Mode: m, Language: language, Filename: path, Paginator: &db.ListOptions{ @@ -128,11 +123,14 @@ func Search(ctx *context.Context) { ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx) } } else { + m := mode.ToGitGrep() + ctx.Data["CodeSearchMode"] = m.String() + res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{ ContextLineNumber: 1, RefName: ctx.Repo.RefName, Filename: path, - Mode: mode.ToGitGrep(), + Mode: m, }) if err != nil { ctx.ServerError("GrepSearch", err) diff --git a/routers/web/user/code.go b/routers/web/user/code.go index b5c5e54953..5c69d72d51 100644 --- a/routers/web/user/code.go +++ b/routers/web/user/code.go @@ -42,10 +42,14 @@ func CodeSearch(ctx *context.Context) { path := ctx.FormTrim("path") mode := code_indexer.SearchModeExact - if m := ctx.FormTrim("mode"); m == "union" || - m == "fuzzy" || - ctx.FormBool("fuzzy") { + if m := ctx.FormTrim("mode"); m == "union" { mode = code_indexer.SearchModeUnion + } else if m == "fuzzy" || ctx.FormBool("fuzzy") { + if setting.Indexer.RepoIndexerEnableFuzzy { + mode 
= code_indexer.SearchModeFuzzy + } else { + mode = code_indexer.SearchModeUnion + } } ctx.Data["Keyword"] = keyword diff --git a/tests/integration/explore_code_test.go b/tests/integration/explore_code_test.go index 97cc97b996..1e09b1ee8a 100644 --- a/tests/integration/explore_code_test.go +++ b/tests/integration/explore_code_test.go @@ -4,6 +4,7 @@ import ( "net/http" "testing" + code_indexer "forgejo.org/modules/indexer/code" "forgejo.org/modules/setting" "forgejo.org/modules/test" "forgejo.org/tests" @@ -16,11 +17,43 @@ func TestExploreCodeSearchIndexer(t *testing.T) { defer tests.PrepareTestEnv(t)() defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnabled, true)() - req := NewRequest(t, "GET", "/explore/code?q=file&fuzzy=true") - resp := MakeRequest(t, req, http.StatusOK) - doc := NewHTMLParser(t, resp.Body).Find(".explore") + t.Run("Exact", func(t *testing.T) { + req := NewRequest(t, "GET", "/explore/code?q=file&mode=exact") + resp := MakeRequest(t, req, http.StatusOK) + doc := NewHTMLParser(t, resp.Body).Find(".explore") - doc.Find(".file-body").Each(func(i int, sel *goquery.Selection) { - assert.Positive(t, sel.Find(".code-inner").Find(".search-highlight").Length(), 0) + active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value") + assert.True(t, ok) + assert.Equal(t, "exact", active) + + doc.Find(".file-body").Each(func(i int, sel *goquery.Selection) { + assert.Positive(t, sel.Find(".code-inner").Find(".search-highlight").Length()) + }) + }) + + t.Run("Fuzzy", func(t *testing.T) { + defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnableFuzzy, true)() + code_indexer.CodeSearchOptions = []string{"exact", "union", "fuzzy"} // usually set by Init + + req := NewRequest(t, "GET", "/explore/code?q=file&mode=fuzzy") + resp := MakeRequest(t, req, http.StatusOK) + doc := NewHTMLParser(t, resp.Body).Find(".explore") + + active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value") + assert.True(t, ok) + 
assert.Equal(t, "fuzzy", active) + }) + + t.Run("No Fuzzy", func(t *testing.T) { + defer test.MockVariableValue(&setting.Indexer.RepoIndexerEnableFuzzy, false)() + code_indexer.CodeSearchOptions = []string{"exact", "union"} // usually set by Init + + req := NewRequest(t, "GET", "/explore/code?q=file&mode=fuzzy") + resp := MakeRequest(t, req, http.StatusOK) + doc := NewHTMLParser(t, resp.Body).Find(".explore") + + active, ok := doc.Find("[data-test-tag=fuzzy-dropdown] .active input").Attr("value") + assert.True(t, ok) + assert.Equal(t, "union", active) }) }