mirror of
https://github.com/prometheus/prometheus.git
synced 2026-06-11 09:30:13 -04:00
strutil/subsequence: rank exact matches above non-exact matches
Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>
This commit is contained in:
parent
551b5b1c56
commit
a597d2250a
2 changed files with 37 additions and 23 deletions
|
|
@ -19,6 +19,9 @@ package strutil
|
|||
|
||||
import "strings"
|
||||
|
||||
// Non-exact matches are scaled below 1.0 so rounded scores stay distinguishable from exact matches.
|
||||
const subsequenceNonExactScoreScale = 0.999
|
||||
|
||||
// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
|
||||
// that it can be scored against many candidate strings without repeating the
|
||||
// ASCII check or rune conversion on the pattern for every call. The first
|
||||
|
|
@ -52,7 +55,7 @@ func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
|
|||
// text in more than one way.
|
||||
//
|
||||
// The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length).
|
||||
// The result is normalized by pattern_length² (the maximum possible raw score).
|
||||
// The result is normalized by pattern_length² and scaled below 1.0 for non-exact matches.
|
||||
func (m *SubsequenceMatcher) Score(text string) float64 {
|
||||
if m.pattern == "" {
|
||||
return 1.0
|
||||
|
|
@ -184,7 +187,7 @@ func matchSubsequenceString(pattern, text string) float64 {
|
|||
if bestScore < 0 {
|
||||
return 0.0
|
||||
}
|
||||
return bestScore / float64(patternLen*patternLen)
|
||||
return normalizeSubsequenceScore(bestScore, patternLen)
|
||||
}
|
||||
|
||||
// matchSubsequenceRunes implements the scoring algorithm over pre-converted
|
||||
|
|
@ -267,6 +270,10 @@ func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
|
|||
return 0.0
|
||||
}
|
||||
|
||||
// Normalize by pattern_length² (the maximum possible raw score).
|
||||
return bestScore / float64(patternLen*patternLen)
|
||||
return normalizeSubsequenceScore(bestScore, patternLen)
|
||||
}
|
||||
|
||||
func normalizeSubsequenceScore(rawScore float64, patternLen int) float64 {
|
||||
score := rawScore / float64(patternLen*patternLen)
|
||||
return score * subsequenceNonExactScoreScale
|
||||
}
|
||||
|
|
|
|||
|
|
@ -135,28 +135,35 @@ func TestSubsequenceScore(t *testing.T) {
|
|||
name: "prefix match",
|
||||
pattern: "my",
|
||||
text: "my awesome text",
|
||||
// intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
|
||||
wantScore: 107.0 / 120.0,
|
||||
// Intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
|
||||
wantScore: 107.0 / 120.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "substring match",
|
||||
pattern: "tex",
|
||||
text: "my awesome text",
|
||||
// intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
|
||||
wantScore: 247.0 / 270.0,
|
||||
// Intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
|
||||
wantScore: 247.0 / 270.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "fuzzy match picks best starting position",
|
||||
pattern: "met",
|
||||
text: "my awesome text",
|
||||
// intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
|
||||
wantScore: 43.0 / 90.0,
|
||||
// Intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
|
||||
wantScore: 43.0 / 90.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "prefers later position with better consecutive run",
|
||||
pattern: "bac",
|
||||
text: "babac",
|
||||
wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
|
||||
wantScore: 43.0 / 45.0 * 0.999, // Match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
|
||||
},
|
||||
{
|
||||
name: "longer prefix match stays below exact match",
|
||||
pattern: "handler1",
|
||||
text: "handler10",
|
||||
// Intervals [0,7], leading=0, trailing=1. raw = 64 - 1/18, normalized by 64 and scaled.
|
||||
wantScore: 1149849.0 / 1152000.0,
|
||||
},
|
||||
{
|
||||
name: "pattern longer than text",
|
||||
|
|
@ -192,8 +199,8 @@ func TestSubsequenceScore(t *testing.T) {
|
|||
name: "unicode prefix match",
|
||||
pattern: "éà",
|
||||
text: "éàü",
|
||||
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0,
|
||||
// Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "unicode no match",
|
||||
|
|
@ -211,16 +218,16 @@ func TestSubsequenceScore(t *testing.T) {
|
|||
name: "unicode fuzzy match with gap between intervals",
|
||||
pattern: "éü",
|
||||
text: "éàü",
|
||||
// intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
|
||||
// raw = 1 + 1 - 1/3, normalized by 4.
|
||||
wantScore: 5.0 / 12.0,
|
||||
// Intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
|
||||
// Raw = 1 + 1 - 1/3, normalized by 4.
|
||||
wantScore: 5.0 / 12.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "mixed ascii and unicode",
|
||||
pattern: "aé",
|
||||
text: "aéb",
|
||||
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0,
|
||||
// Intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "unicode chars sharing leading utf-8 byte do not match",
|
||||
|
|
@ -241,18 +248,18 @@ func TestSubsequenceScore(t *testing.T) {
|
|||
pattern: "oa",
|
||||
text: "goat",
|
||||
// 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1.
|
||||
// raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
|
||||
wantScore: 29.0 / 32.0,
|
||||
// Raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
|
||||
wantScore: 29.0 / 32.0 * 0.999,
|
||||
},
|
||||
{
|
||||
name: "repeated chars use greedy match",
|
||||
pattern: "abaa",
|
||||
text: "abbaa",
|
||||
// Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4].
|
||||
// raw = 2² + 2² - 1/5, normalized by 4² = 16.
|
||||
// Raw = 2² + 2² - 1/5, normalized by 4² = 16.
|
||||
// A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80,
|
||||
// but this test documents the current greedy behavior.
|
||||
wantScore: 39.0 / 80.0,
|
||||
wantScore: 39.0 / 80.0 * 0.999,
|
||||
},
|
||||
}
|
||||
|
||||
|
|
@ -271,7 +278,7 @@ func TestSubsequenceScore(t *testing.T) {
|
|||
func TestSubsequenceScoreProperties(t *testing.T) {
|
||||
// Prefix match scores below 1.0; only exact match scores 1.0.
|
||||
// "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9.
|
||||
require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
|
||||
require.InDelta(t, 173.0/180.0*0.999, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
|
||||
|
||||
// Exact match always scores 1.0.
|
||||
require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))
|
||||
|
|
|
|||
Loading…
Reference in a new issue