strutil/subsequence: rank exact matches above non-exact matches

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>
This commit is contained in:
Julien Pivotto 2026-04-24 12:25:09 +02:00
parent 551b5b1c56
commit a597d2250a
2 changed files with 37 additions and 23 deletions

View file

@ -19,6 +19,9 @@ package strutil
import "strings"
// subsequenceNonExactScoreScale is the factor applied to the normalized score
// of a non-exact match. Scaling those scores below 1.0 keeps them
// distinguishable from exact matches even after the scores are rounded.
const subsequenceNonExactScoreScale = 0.999
// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
// that it can be scored against many candidate strings without repeating the
// ASCII check or rune conversion on the pattern for every call. The first
@ -52,7 +55,7 @@ func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
// text in more than one way.
//
// The raw scoring formula is: Σ(interval_size²) - Σ(gap_size / text_length) - trailing_gap / (2 * text_length).
// The result is normalized by pattern_length² (the maximum possible raw score).
// The result is normalized by pattern_length² and scaled below 1.0 for non-exact matches.
func (m *SubsequenceMatcher) Score(text string) float64 {
if m.pattern == "" {
return 1.0
@ -184,7 +187,7 @@ func matchSubsequenceString(pattern, text string) float64 {
if bestScore < 0 {
return 0.0
}
return bestScore / float64(patternLen*patternLen)
return normalizeSubsequenceScore(bestScore, patternLen)
}
// matchSubsequenceRunes implements the scoring algorithm over pre-converted
@ -267,6 +270,10 @@ func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
return 0.0
}
// Normalize by pattern_length² (the maximum possible raw score).
return bestScore / float64(patternLen*patternLen)
return normalizeSubsequenceScore(bestScore, patternLen)
}
// normalizeSubsequenceScore turns a raw subsequence score into the final score
// of a non-exact match: rawScore is divided by patternLen² and the result is
// multiplied by subsequenceNonExactScoreScale.
//
// patternLen is expected to be positive; a zero patternLen divides by zero and
// yields NaN or ±Inf.
func normalizeSubsequenceScore(rawScore float64, patternLen int) float64 {
	// Square in floating point rather than in int: patternLen*patternLen
	// silently wraps around once it exceeds the int range (patterns longer
	// than 46340 runes on 32-bit platforms), which would corrupt the score.
	n := float64(patternLen)
	return rawScore / (n * n) * subsequenceNonExactScoreScale
}

View file

@ -135,28 +135,35 @@ func TestSubsequenceScore(t *testing.T) {
name: "prefix match",
pattern: "my",
text: "my awesome text",
// intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
wantScore: 107.0 / 120.0,
// Intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
wantScore: 107.0 / 120.0 * 0.999,
},
{
name: "substring match",
pattern: "tex",
text: "my awesome text",
// intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
wantScore: 247.0 / 270.0,
// Intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
wantScore: 247.0 / 270.0 * 0.999,
},
{
name: "fuzzy match picks best starting position",
pattern: "met",
text: "my awesome text",
// intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
wantScore: 43.0 / 90.0,
// Intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
wantScore: 43.0 / 90.0 * 0.999,
},
{
name: "prefers later position with better consecutive run",
pattern: "bac",
text: "babac",
wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
wantScore: 43.0 / 45.0 * 0.999, // Match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
},
{
name: "longer prefix match stays below exact match",
pattern: "handler1",
text: "handler10",
// Intervals [0,7], leading=0, trailing=1. raw = 64 - 1/18, normalized by 64 and scaled.
wantScore: 1149849.0 / 1152000.0,
},
{
name: "pattern longer than text",
@ -192,8 +199,8 @@ func TestSubsequenceScore(t *testing.T) {
name: "unicode prefix match",
pattern: "éà",
text: "éàü",
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
wantScore: 23.0 / 24.0,
// Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
wantScore: 23.0 / 24.0 * 0.999,
},
{
name: "unicode no match",
@ -211,16 +218,16 @@ func TestSubsequenceScore(t *testing.T) {
name: "unicode fuzzy match with gap between intervals",
pattern: "éü",
text: "éàü",
// intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
// raw = 1 + 1 - 1/3, normalized by 4.
wantScore: 5.0 / 12.0,
// Intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
// Raw = 1 + 1 - 1/3, normalized by 4.
wantScore: 5.0 / 12.0 * 0.999,
},
{
name: "mixed ascii and unicode",
pattern: "aé",
text: "aéb",
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
wantScore: 23.0 / 24.0,
// Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
wantScore: 23.0 / 24.0 * 0.999,
},
{
name: "unicode chars sharing leading utf-8 byte do not match",
@ -241,18 +248,18 @@ func TestSubsequenceScore(t *testing.T) {
pattern: "oa",
text: "goat",
// 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1.
// raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
wantScore: 29.0 / 32.0,
// Raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
wantScore: 29.0 / 32.0 * 0.999,
},
{
name: "repeated chars use greedy match",
pattern: "abaa",
text: "abbaa",
// Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4].
// raw = 2² + 2² - 1/5, normalized by 4² = 16.
// Raw = 2² + 2² - 1/5, normalized by 4² = 16.
// A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80 before scaling,
// but this test documents the current greedy behavior.
wantScore: 39.0 / 80.0,
wantScore: 39.0 / 80.0 * 0.999,
},
}
@ -271,7 +278,7 @@ func TestSubsequenceScore(t *testing.T) {
func TestSubsequenceScoreProperties(t *testing.T) {
// Prefix match scores below 1.0; only exact match scores 1.0.
// "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9.
require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
require.InDelta(t, 173.0/180.0*0.999, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
// Exact match always scores 1.0.
require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))