From a597d2250aee0ef7ce4fafbcaffa2a330b39cf6f Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Fri, 24 Apr 2026 12:25:09 +0200 Subject: [PATCH] strutil/subsequence: rank exact matches above non-exact matches Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- util/strutil/subsequence.go | 15 ++++++++--- util/strutil/subsequence_test.go | 45 ++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/util/strutil/subsequence.go b/util/strutil/subsequence.go index 767637e99c..10c8c1e198 100644 --- a/util/strutil/subsequence.go +++ b/util/strutil/subsequence.go @@ -19,6 +19,9 @@ package strutil import "strings" +// Non-exact matches are scaled below 1.0 so rounded scores stay distinguishable from exact matches. +const subsequenceNonExactScoreScale = 0.999 + // SubsequenceMatcher pre-computes the encoding of a fixed search pattern so // that it can be scored against many candidate strings without repeating the // ASCII check or rune conversion on the pattern for every call. The first @@ -52,7 +55,7 @@ func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher { // text in more than one way. // // The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length). -// The result is normalized by pattern_length² (the maximum possible raw score). +// The result is normalized by pattern_length² and scaled below 1.0 for non-exact matches. func (m *SubsequenceMatcher) Score(text string) float64 { if m.pattern == "" { return 1.0 @@ -184,7 +187,7 @@ func matchSubsequenceString(pattern, text string) float64 { if bestScore < 0 { return 0.0 } - return bestScore / float64(patternLen*patternLen) + return normalizeSubsequenceScore(bestScore, patternLen) } // matchSubsequenceRunes implements the scoring algorithm over pre-converted @@ -267,6 +270,10 @@ func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 { return 0.0 } - // Normalize by pattern_length² (the maximum possible raw score). - return bestScore / float64(patternLen*patternLen) + return normalizeSubsequenceScore(bestScore, patternLen) +} + +func normalizeSubsequenceScore(rawScore float64, patternLen int) float64 { + score := rawScore / float64(patternLen*patternLen) + return score * subsequenceNonExactScoreScale } diff --git a/util/strutil/subsequence_test.go b/util/strutil/subsequence_test.go index 00ac62eca4..bf21914a2d 100644 --- a/util/strutil/subsequence_test.go +++ b/util/strutil/subsequence_test.go @@ -135,28 +135,35 @@ func TestSubsequenceScore(t *testing.T) { name: "prefix match", pattern: "my", text: "my awesome text", - // intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4. - wantScore: 107.0 / 120.0, + // Intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4. + wantScore: 107.0 / 120.0 * 0.999, }, { name: "substring match", pattern: "tex", text: "my awesome text", - // intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9. - wantScore: 247.0 / 270.0, + // Intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9. + wantScore: 247.0 / 270.0 * 0.999, }, { name: "fuzzy match picks best starting position", pattern: "met", text: "my awesome text", - // intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9. - wantScore: 43.0 / 90.0, + // Intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9. + wantScore: 43.0 / 90.0 * 0.999, }, { name: "prefers later position with better consecutive run", pattern: "bac", text: "babac", - wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9. + wantScore: 43.0 / 45.0 * 0.999, // Match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9. + }, + { + name: "longer prefix match stays below exact match", + pattern: "handler1", + text: "handler10", + // Intervals [0,7], leading=0, trailing=1. raw = 64 - 1/18, normalized by 64 and scaled. + wantScore: 1149849.0 / 1152000.0, }, { name: "pattern longer than text", @@ -192,8 +199,8 @@ func TestSubsequenceScore(t *testing.T) { name: "unicode prefix match", pattern: "éà", text: "éàü", - // intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4. - wantScore: 23.0 / 24.0, + // Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4. + wantScore: 23.0 / 24.0 * 0.999, }, { name: "unicode no match", @@ -211,16 +218,16 @@ func TestSubsequenceScore(t *testing.T) { name: "unicode fuzzy match with gap between intervals", pattern: "éü", text: "éàü", - // intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0. - // raw = 1 + 1 - 1/3, normalized by 4. - wantScore: 5.0 / 12.0, + // Intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0. + // Raw = 1 + 1 - 1/3, normalized by 4. + wantScore: 5.0 / 12.0 * 0.999, }, { name: "mixed ascii and unicode", pattern: "aé", text: "aéb", - // intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4. - wantScore: 23.0 / 24.0, + // Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4. + wantScore: 23.0 / 24.0 * 0.999, }, { name: "unicode chars sharing leading utf-8 byte do not match", @@ -241,18 +248,18 @@ func TestSubsequenceScore(t *testing.T) { pattern: "oa", text: "goat", // 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1. - // raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4. - wantScore: 29.0 / 32.0, + // Raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4. + wantScore: 29.0 / 32.0 * 0.999, }, { name: "repeated chars use greedy match", pattern: "abaa", text: "abbaa", // Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4]. - // raw = 2² + 2² - 1/5, normalized by 4² = 16. + // Raw = 2² + 2² - 1/5, normalized by 4² = 16. // A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80, // but this test documents the current greedy behavior. - wantScore: 39.0 / 80.0, + wantScore: 39.0 / 80.0 * 0.999, }, } @@ -271,7 +278,7 @@ func TestSubsequenceScore(t *testing.T) { func TestSubsequenceScoreProperties(t *testing.T) { // Prefix match scores below 1.0; only exact match scores 1.0. // "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9. - require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9) + require.InDelta(t, 173.0/180.0*0.999, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9) // Exact match always scores 1.0. require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))